171 files changed, 8619 insertions, 2300 deletions
diff --git a/Documentation/ABI/testing/sysfs-class-cxl b/Documentation/ABI/testing/sysfs-class-cxl
index b07e86d4597f..7fd737eed38a 100644
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -159,7 +159,7 @@ Description:    read only
                 Decimal value of the Per Process MMIO space length.
 Users:		https://github.com/ibm-capi/libcxl
 
-What:           /sys/class/cxl/<afu>m/pp_mmio_off
+What:           /sys/class/cxl/<afu>m/pp_mmio_off (not in a guest)
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
@@ -183,7 +183,7 @@ Description:    read only
                 Identifies the revision level of the PSL.
 Users:		https://github.com/ibm-capi/libcxl
 
-What:           /sys/class/cxl/<card>/base_image
+What:           /sys/class/cxl/<card>/base_image (not in a guest)
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
@@ -193,7 +193,7 @@ Description:    read only
                 during the initial program load.
 Users:		https://github.com/ibm-capi/libcxl
 
-What:           /sys/class/cxl/<card>/image_loaded
+What:           /sys/class/cxl/<card>/image_loaded (not in a guest)
 Date:           September 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read only
@@ -201,7 +201,7 @@ Description:    read only
                 onto the card.
 Users:		https://github.com/ibm-capi/libcxl
 
-What:           /sys/class/cxl/<card>/load_image_on_perst
+What:           /sys/class/cxl/<card>/load_image_on_perst (not in a guest)
 Date:           December 2014
 Contact:        linuxppc-dev@lists.ozlabs.org
 Description:    read/write
@@ -224,7 +224,7 @@ Description:    write only
                 to reload the FPGA depending on load_image_on_perst.
 Users:		https://github.com/ibm-capi/libcxl
 
-What:		/sys/class/cxl/<card>/perst_reloads_same_image
+What:		/sys/class/cxl/<card>/perst_reloads_same_image (not in a guest)
 Date:		July 2015
 Contact:	linuxppc-dev@lists.ozlabs.org
 Description:	read/write
diff --git a/Documentation/powerpc/cxl.txt b/Documentation/powerpc/cxl.txt
index 205c1b81625c..d5506ba0fef7 100644
--- a/Documentation/powerpc/cxl.txt
+++ b/Documentation/powerpc/cxl.txt
@@ -116,6 +116,8 @@ Work Element Descriptor (WED)
 User API
 ========
 
+1. AFU character devices
+
     For AFUs operating in AFU directed mode, two character device
     files will be created. /dev/cxl/afu0.0m will correspond to a
     master context and /dev/cxl/afu0.0s will correspond to a slave
@@ -362,6 +364,59 @@ read
         reserved fields:
             For future extensions and padding
 
+
+2. Card character device (powerVM guest only)
+
+    In a powerVM guest, an extra character device is created for the
+    card. The device is only used to write (flash) a new image on the
+    FPGA accelerator. Once the image is written and verified, the
+    device tree is updated and the card is reset to reload the updated
+    image.
+
+open
+----
+
+    Opens the device and allocates a file descriptor to be used with
+    the rest of the API. The device can only be opened once.
+
+ioctl
+-----
+
+CXL_IOCTL_DOWNLOAD_IMAGE:
+CXL_IOCTL_VALIDATE_IMAGE:
+    Starts and controls flashing a new FPGA image. Partial
+    reconfiguration is not supported (yet), so the image must contain
+    a copy of the PSL and AFU(s). Since an image can be quite large,
+    the caller may have to iterate, splitting the image in smaller
+    chunks.
+
+    Takes a pointer to a struct cxl_adapter_image:
+        struct cxl_adapter_image {
+            __u64 flags;
+            __u64 data;
+            __u64 len_data;
+            __u64 len_image;
+            __u64 reserved1;
+            __u64 reserved2;
+            __u64 reserved3;
+            __u64 reserved4;
+        };
+
+    flags:
+        These flags indicate which optional fields are present in
+        this struct. Currently all fields are mandatory.
+
+    data:
+        Pointer to a buffer with part of the image to write to the
+        card.
+
+    len_data:
+        Size of the buffer pointed to by data.
+
+    len_image:
+        Full size of the image.
+
+
 Sysfs Class
 ===========
 
diff --git a/MAINTAINERS b/MAINTAINERS
index 7f1fa4ff300a..74bbff3dda52 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4230,13 +4230,6 @@ M:	Maxim Levitsky <maximlevitsky@gmail.com>
 S:	Maintained
 F:	drivers/media/rc/ene_ir.*
 
-ENHANCED ERROR HANDLING (EEH)
-M:	Gavin Shan <shangw@linux.vnet.ibm.com>
-L:	linuxppc-dev@lists.ozlabs.org
-S:	Supported
-F:	Documentation/powerpc/eeh-pci-error-recovery.txt
-F:	arch/powerpc/kernel/eeh*.c
-
 EPSON S1D13XXX FRAMEBUFFER DRIVER
 M:	Kristoffer Ericson <kristoffer.ericson@gmail.com>
 S:	Maintained
@@ -8252,6 +8245,15 @@ L:	linux-pci@vger.kernel.org
 S:	Supported
 F:	Documentation/PCI/pci-error-recovery.txt
 
+PCI ENHANCED ERROR HANDLING (EEH) FOR POWERPC
+M:	Russell Currey <ruscur@russell.cc>
+L:	linuxppc-dev@lists.ozlabs.org
+S:	Supported
+F:	Documentation/powerpc/eeh-pci-error-recovery.txt
+F:	arch/powerpc/kernel/eeh*.c
+F:	arch/powerpc/platforms/*/eeh*.c
+F:	arch/powerpc/include/*/eeh*.h
+
 PCI SUBSYSTEM
 M:	Bjorn Helgaas <bhelgaas@google.com>
 L:	linux-pci@vger.kernel.org
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 91da283cd658..5130fa166a93 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -304,7 +304,7 @@ config ZONE_DMA32
 config PGTABLE_LEVELS
 	int
 	default 2 if !PPC64
-	default 3 if PPC_64K_PAGES
+	default 3 if PPC_64K_PAGES && !PPC_BOOK3S_64
 	default 4
 
 source "init/Kconfig"
@@ -576,7 +576,7 @@ choice
 
 config PPC_4K_PAGES
 	bool "4k page size"
-	select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_16K_PAGES
 	bool "16k page size"
@@ -585,7 +585,7 @@ config PPC_16K_PAGES
 config PPC_64K_PAGES
 	bool "64k page size"
 	depends on !PPC_FSL_BOOK3E && (44x || PPC_STD_MMU_64 || PPC_BOOK3E_64)
-	select HAVE_ARCH_SOFT_DIRTY if CHECKPOINT_RESTORE && PPC_BOOK3S
+	select HAVE_ARCH_SOFT_DIRTY if PPC_BOOK3S_64
 
 config PPC_256K_PAGES
 	bool "256k page size"
diff --git a/arch/powerpc/boot/rs6000.h b/arch/powerpc/boot/rs6000.h
index 433f45084e41..d70517ccc0f7 100644
--- a/arch/powerpc/boot/rs6000.h
+++ b/arch/powerpc/boot/rs6000.h
@@ -239,5 +239,5 @@ struct external_reloc {
 #define DEFAULT_DATA_SECTION_ALIGNMENT 4
 #define DEFAULT_BSS_SECTION_ALIGNMENT 4
 #define DEFAULT_TEXT_SECTION_ALIGNMENT 4
-/* For new sections we havn't heard of before */
+/* For new sections we haven't heard of before */
 #define DEFAULT_SECTION_ALIGNMENT 4
diff --git a/arch/powerpc/boot/treeboot-akebono.c b/arch/powerpc/boot/treeboot-akebono.c
index b73174c34fe4..bcc5902f8462 100644
--- a/arch/powerpc/boot/treeboot-akebono.c
+++ b/arch/powerpc/boot/treeboot-akebono.c
@@ -38,7 +38,7 @@
 
 BSS_STACK(4096);
 
-#define SPRN_PIR	0x11E	/* Processor Indentification Register */
+#define SPRN_PIR	0x11E	/* Processor Identification Register */
 #define USERDATA_LEN	256	/* Length of userdata passed in by PIBS */
 #define MAX_RANKS	0x4
 #define DDR3_MR0CF	0x80010011U
diff --git a/arch/powerpc/boot/treeboot-currituck.c b/arch/powerpc/boot/treeboot-currituck.c
index 925ae43b7467..303d2074ee56 100644
--- a/arch/powerpc/boot/treeboot-currituck.c
+++ b/arch/powerpc/boot/treeboot-currituck.c
@@ -80,7 +80,7 @@ static void ibm_currituck_fixups(void)
 	}
 }
 
-#define SPRN_PIR	0x11E	/* Processor Indentification Register */
+#define SPRN_PIR	0x11E	/* Processor Identification Register */
 void platform_init(void)
 {
 	unsigned long end_of_ram, avail_ram;
diff --git a/arch/powerpc/boot/treeboot-iss4xx.c b/arch/powerpc/boot/treeboot-iss4xx.c
index 329e710feda2..733f8bf25184 100644
--- a/arch/powerpc/boot/treeboot-iss4xx.c
+++ b/arch/powerpc/boot/treeboot-iss4xx.c
@@ -59,7 +59,7 @@ static void *iss_4xx_vmlinux_alloc(unsigned long size)
 	return (void *)ibm4xx_memstart;
 }
 
-#define SPRN_PIR	0x11E	/* Processor Indentification Register */
+#define SPRN_PIR	0x11E	/* Processor Identification Register */
 void platform_init(void)
 {
 	unsigned long end_of_ram = 0x08000000;
diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
new file mode 100644
index 000000000000..045031048f8d
--- /dev/null
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -0,0 +1,313 @@
+CONFIG_PPC64=y
+CONFIG_SMP=y
+CONFIG_NR_CPUS=2048
+CONFIG_CPU_LITTLE_ENDIAN=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_FHANDLE=y
+CONFIG_AUDIT=y
+CONFIG_IRQ_DOMAIN_DEBUG=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_NUMA_BALANCING=y
+CONFIG_CGROUPS=y
+CONFIG_MEMCG=y
+CONFIG_MEMCG_SWAP=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_DEVICE=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_PERF=y
+CONFIG_USER_NS=y
+CONFIG_BLK_DEV_INITRD=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_KPROBES=y
+CONFIG_JUMP_LABEL=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_MODULE_SRCVERSION_ALL=y
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OPAL_PRD=y
+# CONFIG_PPC_PSERIES is not set
+# CONFIG_PPC_OF_BOOT_TRAMPOLINE is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_IDLE=y
+CONFIG_HZ_100=y
+CONFIG_BINFMT_MISC=m
+CONFIG_PPC_TRANSACTIONAL_MEM=y
+CONFIG_HOTPLUG_CPU=y
+CONFIG_KEXEC=y
+CONFIG_IRQ_ALL_CPUS=y
+CONFIG_NUMA=y
+CONFIG_MEMORY_HOTPLUG=y
+CONFIG_MEMORY_HOTREMOVE=y
+CONFIG_KSM=y
+CONFIG_TRANSPARENT_HUGEPAGE=y
+CONFIG_PPC_64K_PAGES=y
+CONFIG_PPC_SUBPAGE_PROT=y
+CONFIG_SCHED_SMT=y
+CONFIG_PM=y
+CONFIG_PCI_MSI=y
+CONFIG_HOTPLUG_PCI=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=m
+CONFIG_NET_KEY=m
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_NET_IPIP=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_AH=m
+CONFIG_INET_ESP=m
+CONFIG_INET_IPCOMP=m
+# CONFIG_IPV6 is not set
+CONFIG_NETFILTER=y
+# CONFIG_NETFILTER_ADVANCED is not set
+CONFIG_BRIDGE=m
+CONFIG_VLAN_8021Q=m
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DEVTMPFS=y
+CONFIG_DEVTMPFS_MOUNT=y
+CONFIG_MTD=y
+CONFIG_MTD_POWERNV_FLASH=y
+CONFIG_PARPORT=m
+CONFIG_PARPORT_PC=m
+CONFIG_BLK_DEV_FD=m
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_NBD=m
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=65536
+CONFIG_VIRTIO_BLK=m
+CONFIG_IDE=y
+CONFIG_BLK_DEV_IDECD=y
+CONFIG_BLK_DEV_GENERIC=y
+CONFIG_BLK_DEV_AMD74XX=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_CHR_DEV_ST=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_FC_ATTRS=y
+CONFIG_SCSI_SRP_ATTRS=y
+CONFIG_SCSI_CXGB3_ISCSI=m
+CONFIG_SCSI_CXGB4_ISCSI=m
+CONFIG_SCSI_BNX2_ISCSI=m
+CONFIG_BE2ISCSI=m
+CONFIG_SCSI_MPT2SAS=m
+CONFIG_SCSI_SYM53C8XX_2=y
+CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
+CONFIG_SCSI_IPR=y
+CONFIG_SCSI_QLA_FC=m
+CONFIG_SCSI_QLA_ISCSI=m
+CONFIG_SCSI_LPFC=m
+CONFIG_SCSI_VIRTIO=m
+CONFIG_SCSI_DH=y
+CONFIG_SCSI_DH_RDAC=m
+CONFIG_SCSI_DH_ALUA=m
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+# CONFIG_ATA_SFF is not set
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_MD_LINEAR=y
+CONFIG_MD_RAID0=y
+CONFIG_MD_RAID1=y
+CONFIG_MD_RAID10=m
+CONFIG_MD_RAID456=m
+CONFIG_MD_MULTIPATH=m
+CONFIG_MD_FAULTY=m
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=m
+CONFIG_DM_SNAPSHOT=m
+CONFIG_DM_THIN_PROVISIONING=m
+CONFIG_DM_MIRROR=m
+CONFIG_DM_ZERO=m
+CONFIG_DM_MULTIPATH=m
+CONFIG_DM_MULTIPATH_QL=m
+CONFIG_DM_MULTIPATH_ST=m
+CONFIG_DM_UEVENT=y
+CONFIG_BONDING=m
+CONFIG_DUMMY=m
+CONFIG_MACVLAN=m
+CONFIG_MACVTAP=m
+CONFIG_VXLAN=m
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=m
+CONFIG_VETH=m
+CONFIG_VIRTIO_NET=m
+CONFIG_VHOST_NET=m
+CONFIG_VORTEX=y
+CONFIG_ACENIC=m
+CONFIG_ACENIC_OMIT_TIGON_I=y
+CONFIG_PCNET32=y
+CONFIG_TIGON3=y
+CONFIG_BNX2X=m
+CONFIG_CHELSIO_T1=m
+CONFIG_BE2NET=m
+CONFIG_S2IO=m
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_IXGB=m
+CONFIG_IXGBE=m
+CONFIG_MLX4_EN=m
+CONFIG_MYRI10GE=m
+CONFIG_QLGE=m
+CONFIG_NETXEN_NIC=m
+CONFIG_PPP=m
+CONFIG_PPP_BSDCOMP=m
+CONFIG_PPP_DEFLATE=m
+CONFIG_PPPOE=m
+CONFIG_PPP_ASYNC=m
+CONFIG_PPP_SYNC_TTY=m
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=m
+CONFIG_INPUT_MISC=y
+# CONFIG_SERIO_SERPORT is not set
+CONFIG_DEVPTS_MULTIPLE_INSTANCES=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_JSM=m
+CONFIG_VIRTIO_CONSOLE=m
+CONFIG_IPMI_HANDLER=y
+CONFIG_IPMI_DEVICE_INTERFACE=y
+CONFIG_IPMI_POWERNV=y
+CONFIG_RAW_DRIVER=y
+CONFIG_MAX_RAW_DEVS=1024
+CONFIG_DRM=y
+CONFIG_DRM_AST=y
+CONFIG_FIRMWARE_EDID=y
+CONFIG_FB_OF=y
+CONFIG_FB_MATROX=y
+CONFIG_FB_MATROX_MILLENIUM=y
+CONFIG_FB_MATROX_MYSTIQUE=y
+CONFIG_FB_MATROX_G=y
+CONFIG_FB_RADEON=y
+CONFIG_FB_IBM_GXT4500=y
+CONFIG_LCD_PLATFORM=m
+# CONFIG_VGA_CONSOLE is not set
+CONFIG_LOGO=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB=y
+CONFIG_USB_MON=m
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_HCD_PPC_OF is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_STORAGE=m
+CONFIG_NEW_LEDS=y
+CONFIG_LEDS_CLASS=m
+CONFIG_LEDS_POWERNV=m
+CONFIG_INFINIBAND=m
+CONFIG_INFINIBAND_USER_MAD=m
+CONFIG_INFINIBAND_USER_ACCESS=m
+CONFIG_INFINIBAND_MTHCA=m
+CONFIG_INFINIBAND_CXGB3=m
+CONFIG_INFINIBAND_CXGB4=m
+CONFIG_MLX4_INFINIBAND=m
+CONFIG_INFINIBAND_IPOIB=m
+CONFIG_INFINIBAND_IPOIB_CM=y
+CONFIG_INFINIBAND_SRP=m
+CONFIG_INFINIBAND_ISER=m
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_GENERIC=y
+CONFIG_VIRTIO_PCI=m
+CONFIG_VIRTIO_BALLOON=m
+CONFIG_EXT2_FS=y
+CONFIG_EXT2_FS_XATTR=y
+CONFIG_EXT2_FS_POSIX_ACL=y
+CONFIG_EXT2_FS_SECURITY=y
+CONFIG_EXT3_FS=y
+CONFIG_EXT3_FS_POSIX_ACL=y
+CONFIG_EXT3_FS_SECURITY=y
+CONFIG_REISERFS_FS=y
+CONFIG_REISERFS_FS_XATTR=y
+CONFIG_REISERFS_FS_POSIX_ACL=y
+CONFIG_REISERFS_FS_SECURITY=y
+CONFIG_JFS_FS=m
+CONFIG_JFS_POSIX_ACL=y
+CONFIG_JFS_SECURITY=y
+CONFIG_XFS_FS=m
+CONFIG_XFS_POSIX_ACL=y
+CONFIG_BTRFS_FS=m
+CONFIG_BTRFS_FS_POSIX_ACL=y
+CONFIG_NILFS2_FS=m
+CONFIG_AUTOFS4_FS=m
+CONFIG_FUSE_FS=m
+CONFIG_OVERLAY_FS=m
+CONFIG_ISO9660_FS=y
+CONFIG_UDF_FS=m
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_CRAMFS=m
+CONFIG_SQUASHFS=m
+CONFIG_SQUASHFS_XATTR=y
+CONFIG_SQUASHFS_LZO=y
+CONFIG_SQUASHFS_XZ=y
+CONFIG_PSTORE=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V3_ACL=y
+CONFIG_NFS_V4=y
+CONFIG_NFSD=m
+CONFIG_NFSD_V3_ACL=y
+CONFIG_NFSD_V4=y
+CONFIG_CIFS=m
+CONFIG_CIFS_XATTR=y
+CONFIG_CIFS_POSIX=y
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_KERNEL=y
+CONFIG_DEBUG_STACK_USAGE=y
+CONFIG_DEBUG_STACKOVERFLOW=y
+CONFIG_LOCKUP_DETECTOR=y
+CONFIG_LATENCYTOP=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_CODE_PATCHING_SELFTEST=y
+CONFIG_FTR_FIXUP_SELFTEST=y
+CONFIG_MSI_BITMAP_SELFTEST=y
+CONFIG_XMON=y
+CONFIG_CRYPTO_TEST=m
+CONFIG_CRYPTO_CCM=m
+CONFIG_CRYPTO_PCBC=m
+CONFIG_CRYPTO_HMAC=y
+CONFIG_CRYPTO_MICHAEL_MIC=m
+CONFIG_CRYPTO_TGR192=m
+CONFIG_CRYPTO_WP512=m
+CONFIG_CRYPTO_ANUBIS=m
+CONFIG_CRYPTO_BLOWFISH=m
+CONFIG_CRYPTO_CAST6=m
+CONFIG_CRYPTO_KHAZAD=m
+CONFIG_CRYPTO_SALSA20=m
+CONFIG_CRYPTO_SERPENT=m
+CONFIG_CRYPTO_TEA=m
+CONFIG_CRYPTO_TWOFISH=m
+CONFIG_CRYPTO_LZO=m
+CONFIG_CRYPTO_DEV_NX=y
+CONFIG_VIRTUALIZATION=y
+CONFIG_KVM_BOOK3S_64=m
+CONFIG_KVM_BOOK3S_64_HV=m
diff --git a/arch/powerpc/crypto/aes-spe-core.S b/arch/powerpc/crypto/aes-spe-core.S
index 5dc6bce90a77..bc6ff43a9889 100644
--- a/arch/powerpc/crypto/aes-spe-core.S
+++ b/arch/powerpc/crypto/aes-spe-core.S
@@ -61,7 +61,7 @@
  * via bl/blr. It expects that caller has pre-xored input data with first
  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
  * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
- * and rW0-rW3 and caller must execute a final xor on the ouput registers.
+ * and rW0-rW3 and caller must execute a final xor on the output registers.
  * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
  *
  */
@@ -209,7 +209,7 @@ ppc_encrypt_block_loop:
  * via bl/blr. It expects that caller has pre-xored input data with first
  * 4 words of encryption key into rD0-rD3. Pointer/counter registers must
  * have also been set up before (rT0, rKP, CTR). Output is stored in rD0-rD3
- * and rW0-rW3 and caller must execute a final xor on the ouput registers.
+ * and rW0-rW3 and caller must execute a final xor on the output registers.
  * All working registers rD0-rD3 & rW0-rW7 are overwritten during processing.
  *
  */
diff --git a/arch/powerpc/crypto/aes-spe-glue.c b/arch/powerpc/crypto/aes-spe-glue.c
index 93ee046d12cd..6d99ebf2ea15 100644
--- a/arch/powerpc/crypto/aes-spe-glue.c
+++ b/arch/powerpc/crypto/aes-spe-glue.c
@@ -32,7 +32,7 @@
  * 16 byte block block or 25 cycles per byte. Thus 768 bytes of input data
  * will need an estimated maximum of 20,000 cycles. Headroom for cache misses
  * included. Even with the low end model clocked at 667 MHz this equals to a
- * critical time window of less than 30us. The value has been choosen to
+ * critical time window of less than 30us. The value has been chosen to
  * process a 512 byte disk block in one or a large 1400 bytes IPsec network
  * packet in two runs.
  *
diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
index 55f106ed12bf..ae0751ef8788 100644
--- a/arch/powerpc/include/asm/atomic.h
+++ b/arch/powerpc/include/asm/atomic.h
@@ -12,6 +12,24 @@
 
 #define ATOMIC_INIT(i)		{ (i) }
 
+/*
+ * Since *_return_relaxed and {cmp}xchg_relaxed are implemented with
+ * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
+ * on the platform without lwsync.
+ */
+#define __atomic_op_acquire(op, args...)				\
+({									\
+	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
+	__asm__ __volatile__(PPC_ACQUIRE_BARRIER "" : : : "memory");	\
+	__ret;								\
+})
+
+#define __atomic_op_release(op, args...)				\
+({									\
+	__asm__ __volatile__(PPC_RELEASE_BARRIER "" : : : "memory");	\
+	op##_relaxed(args);						\
+})
+
 static __inline__ int atomic_read(const atomic_t *v)
 {
 	int t;
@@ -42,27 +60,27 @@ static __inline__ void atomic_##op(int a, atomic_t *v)			\
 	: "cc");							\
 }									\
 
-#define ATOMIC_OP_RETURN(op, asm_op)					\
-static __inline__ int atomic_##op##_return(int a, atomic_t *v)		\
+#define ATOMIC_OP_RETURN_RELAXED(op, asm_op)				\
+static inline int atomic_##op##_return_relaxed(int a, atomic_t *v)	\
 {									\
 	int t;								\
 									\
 	__asm__ __volatile__(						\
-	PPC_ATOMIC_ENTRY_BARRIER					\
-"1:	lwarx	%0,0,%2		# atomic_" #op "_return\n"		\
-	#asm_op " %0,%1,%0\n"						\
-	PPC405_ERR77(0,%2)						\
-"	stwcx.	%0,0,%2 \n"						\
+"1:	lwarx	%0,0,%3		# atomic_" #op "_return_relaxed\n"	\
+	#asm_op " %0,%2,%0\n"						\
+	PPC405_ERR77(0, %3)						\
+"	stwcx.	%0,0,%3\n"						\
 "	bne-	1b\n"							\
-	PPC_ATOMIC_EXIT_BARRIER						\
-	: "=&r" (t)							\
+	: "=&r" (t), "+m" (v->counter)					\
 	: "r" (a), "r" (&v->counter)					\
-	: "cc", "memory");						\
+	: "cc");							\
 									\
 	return t;							\
 }
 
-#define ATOMIC_OPS(op, asm_op) ATOMIC_OP(op, asm_op) ATOMIC_OP_RETURN(op, asm_op)
+#define ATOMIC_OPS(op, asm_op)						\
+	ATOMIC_OP(op, asm_op)						\
+	ATOMIC_OP_RETURN_RELAXED(op, asm_op)
 
 ATOMIC_OPS(add, add)
 ATOMIC_OPS(sub, subf)
@@ -71,8 +89,11 @@ ATOMIC_OP(and, and)
 ATOMIC_OP(or, or)
 ATOMIC_OP(xor, xor)
 
+#define atomic_add_return_relaxed atomic_add_return_relaxed
+#define atomic_sub_return_relaxed atomic_sub_return_relaxed
+
 #undef ATOMIC_OPS
-#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP_RETURN_RELAXED
 #undef ATOMIC_OP
 
 #define atomic_add_negative(a, v)	(atomic_add_return((a), (v)) < 0)
@@ -92,21 +113,19 @@ static __inline__ void atomic_inc(atomic_t *v)
 	: "cc", "xer");
 }
 
-static __inline__ int atomic_inc_return(atomic_t *v)
+static __inline__ int atomic_inc_return_relaxed(atomic_t *v)
 {
 	int t;
 
 	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	lwarx	%0,0,%1		# atomic_inc_return\n\
-	addic	%0,%0,1\n"
-	PPC405_ERR77(0,%1)
-"	stwcx.	%0,0,%1 \n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
+"1:	lwarx	%0,0,%2		# atomic_inc_return_relaxed\n"
+"	addic	%0,%0,1\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%0,0,%2\n"
+"	bne-	1b"
+	: "=&r" (t), "+m" (v->counter)
 	: "r" (&v->counter)
-	: "cc", "xer", "memory");
+	: "cc", "xer");
 
 	return t;
 }
@@ -136,27 +155,34 @@ static __inline__ void atomic_dec(atomic_t *v)
 	: "cc", "xer");
 }
 
-static __inline__ int atomic_dec_return(atomic_t *v)
+static __inline__ int atomic_dec_return_relaxed(atomic_t *v)
 {
 	int t;
 
 	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	lwarx	%0,0,%1		# atomic_dec_return\n\
-	addic	%0,%0,-1\n"
-	PPC405_ERR77(0,%1)
-"	stwcx.	%0,0,%1\n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
+"1:	lwarx	%0,0,%2		# atomic_dec_return_relaxed\n"
+"	addic	%0,%0,-1\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%0,0,%2\n"
+"	bne-	1b"
+	: "=&r" (t), "+m" (v->counter)
 	: "r" (&v->counter)
-	: "cc", "xer", "memory");
+	: "cc", "xer");
 
 	return t;
 }
 
+#define atomic_inc_return_relaxed atomic_inc_return_relaxed
+#define atomic_dec_return_relaxed atomic_dec_return_relaxed
+
 #define atomic_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+#define atomic_cmpxchg_relaxed(v, o, n) \
+	cmpxchg_relaxed(&((v)->counter), (o), (n))
+#define atomic_cmpxchg_acquire(v, o, n) \
+	cmpxchg_acquire(&((v)->counter), (o), (n))
+
 #define atomic_xchg(v, new) (xchg(&((v)->counter), new))
+#define atomic_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
 /**
  * __atomic_add_unless - add unless the number is a given value
@@ -285,26 +311,27 @@ static __inline__ void atomic64_##op(long a, atomic64_t *v)		\
 	: "cc");							\
 }
 
-#define ATOMIC64_OP_RETURN(op, asm_op)					\
-static __inline__ long atomic64_##op##_return(long a, atomic64_t *v)	\
+#define ATOMIC64_OP_RETURN_RELAXED(op, asm_op)				\
+static inline long							\
+atomic64_##op##_return_relaxed(long a, atomic64_t *v)			\
 {									\
 	long t;								\
 									\
 	__asm__ __volatile__(						\
-	PPC_ATOMIC_ENTRY_BARRIER					\
-"1:	ldarx	%0,0,%2		# atomic64_" #op "_return\n"		\
-	#asm_op " %0,%1,%0\n"						\
-"	stdcx.	%0,0,%2 \n"						\
+"1:	ldarx	%0,0,%3		# atomic64_" #op "_return_relaxed\n"	\
+	#asm_op " %0,%2,%0\n"						\
+"	stdcx.	%0,0,%3\n"						\
 "	bne-	1b\n"							\
-	PPC_ATOMIC_EXIT_BARRIER						\
-	: "=&r" (t)							\
+	: "=&r" (t), "+m" (v->counter)					\
 	: "r" (a), "r" (&v->counter)					\
-	: "cc", "memory");						\
+	: "cc");							\
 									\
 	return t;							\
 }
 
-#define ATOMIC64_OPS(op, asm_op) ATOMIC64_OP(op, asm_op) ATOMIC64_OP_RETURN(op, asm_op)
+#define ATOMIC64_OPS(op, asm_op)					\
+	ATOMIC64_OP(op, asm_op)						\
+	ATOMIC64_OP_RETURN_RELAXED(op, asm_op)
 
 ATOMIC64_OPS(add, add)
 ATOMIC64_OPS(sub, subf)
@@ -312,8 +339,11 @@ ATOMIC64_OP(and, and)
 ATOMIC64_OP(or, or)
 ATOMIC64_OP(xor, xor)
 
-#undef ATOMIC64_OPS
-#undef ATOMIC64_OP_RETURN
+#define atomic64_add_return_relaxed atomic64_add_return_relaxed
+#define atomic64_sub_return_relaxed atomic64_sub_return_relaxed
+
+#undef ATOPIC64_OPS
+#undef ATOMIC64_OP_RETURN_RELAXED
 #undef ATOMIC64_OP
 
 #define atomic64_add_negative(a, v)	(atomic64_add_return((a), (v)) < 0)
@@ -332,20 +362,18 @@ static __inline__ void atomic64_inc(atomic64_t *v)
 	: "cc", "xer");
 }
 
-static __inline__ long atomic64_inc_return(atomic64_t *v)
+static __inline__ long atomic64_inc_return_relaxed(atomic64_t *v)
 {
 	long t;
 
 	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	ldarx	%0,0,%1		# atomic64_inc_return\n\
-	addic	%0,%0,1\n\
-	stdcx.	%0,0,%1 \n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
+"1:	ldarx	%0,0,%2		# atomic64_inc_return_relaxed\n"
+"	addic	%0,%0,1\n"
+"	stdcx.	%0,0,%2\n"
+"	bne-	1b"
+	: "=&r" (t), "+m" (v->counter)
 	: "r" (&v->counter)
-	: "cc", "xer", "memory");
+	: "cc", "xer");
 
 	return t;
 }
@@ -374,24 +402,25 @@ static __inline__ void atomic64_dec(atomic64_t *v)
 	: "cc", "xer");
 }
 
-static __inline__ long atomic64_dec_return(atomic64_t *v)
+static __inline__ long atomic64_dec_return_relaxed(atomic64_t *v)
 {
 	long t;
 
 	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
-"1:	ldarx	%0,0,%1		# atomic64_dec_return\n\
-	addic	%0,%0,-1\n\
-	stdcx.	%0,0,%1\n\
-	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
-	: "=&r" (t)
+"1:	ldarx	%0,0,%2		# atomic64_dec_return_relaxed\n"
+"	addic	%0,%0,-1\n"
+"	stdcx.	%0,0,%2\n"
+"	bne-	1b"
+	: "=&r" (t), "+m" (v->counter)
 	: "r" (&v->counter)
-	: "cc", "xer", "memory");
+	: "cc", "xer");
 
 	return t;
 }
 
+#define atomic64_inc_return_relaxed atomic64_inc_return_relaxed
+#define atomic64_dec_return_relaxed atomic64_dec_return_relaxed
+
 #define atomic64_sub_and_test(a, v)	(atomic64_sub_return((a), (v)) == 0)
 #define atomic64_dec_and_test(v)	(atomic64_dec_return((v)) == 0)
 
@@ -420,7 +449,13 @@ static __inline__ long atomic64_dec_if_positive(atomic64_t *v)
 }
 
 #define atomic64_cmpxchg(v, o, n) (cmpxchg(&((v)->counter), (o), (n)))
+#define atomic64_cmpxchg_relaxed(v, o, n) \
+	cmpxchg_relaxed(&((v)->counter), (o), (n))
+#define atomic64_cmpxchg_acquire(v, o, n) \
+	cmpxchg_acquire(&((v)->counter), (o), (n))
+
 #define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
+#define atomic64_xchg_relaxed(v, new) xchg_relaxed(&((v)->counter), (new))
 
 /**
  * atomic64_add_unless - add unless the number is a given value
diff --git a/arch/powerpc/include/asm/mmu-hash32.h b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
index 16f513e5cbd7..16f513e5cbd7 100644
--- a/arch/powerpc/include/asm/mmu-hash32.h
+++ b/arch/powerpc/include/asm/book3s/32/mmu-hash.h
diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h
index ea0414d6659e..5f08a0832238 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-4k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h
@@ -52,44 +52,14 @@
 			 _PAGE_F_SECOND | _PAGE_F_GIX)
 
 /* shift to put page number into pte */
-#define PTE_RPN_SHIFT	(18)
+#define PTE_RPN_SHIFT	(12)
+#define PTE_RPN_SIZE	(45)	/* gives 57-bit real addresses */
 
 #define _PAGE_4K_PFN		0
 #ifndef __ASSEMBLY__
 /*
- * 4-level page tables related bits
+ * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range()
  */
-
-#define pgd_none(pgd)		(!pgd_val(pgd))
-#define pgd_bad(pgd)		(pgd_val(pgd) == 0)
-#define pgd_present(pgd)	(pgd_val(pgd) != 0)
-#define pgd_page_vaddr(pgd)	(pgd_val(pgd) & ~PGD_MASKED_BITS)
-
-static inline void pgd_clear(pgd_t *pgdp)
-{
-	*pgdp = __pgd(0);
-}
-
-static inline pte_t pgd_pte(pgd_t pgd)
-{
-	return __pte(pgd_val(pgd));
-}
-
-static inline pgd_t pte_pgd(pte_t pte)
-{
-	return __pgd(pte_val(pte));
-}
-extern struct page *pgd_page(pgd_t pgd);
-
-#define pud_offset(pgdp, addr)	\
-  (((pud_t *) pgd_page_vaddr(*(pgdp))) + \
-    (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)))
-
-#define pud_ERROR(e) \
-	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
-
-/*
- * On all 4K setups, remap_4k_pfn() equates to remap_pfn_range() */
 #define remap_4k_pfn(vma, addr, pfn, prot)	\
 	remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE, (prot))
 
diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h
index 849bbec80f7b..0a7956a80a08 100644
--- a/arch/powerpc/include/asm/book3s/64/hash-64k.h
+++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h
@@ -1,15 +1,14 @@
 #ifndef _ASM_POWERPC_BOOK3S_64_HASH_64K_H
 #define _ASM_POWERPC_BOOK3S_64_HASH_64K_H
 
-#include <asm-generic/pgtable-nopud.h>
-
 #define PTE_INDEX_SIZE  8
-#define PMD_INDEX_SIZE  10
-#define PUD_INDEX_SIZE	0
+#define PMD_INDEX_SIZE  5
+#define PUD_INDEX_SIZE	5
 #define PGD_INDEX_SIZE  12
 
 #define PTRS_PER_PTE	(1 << PTE_INDEX_SIZE)
 #define PTRS_PER_PMD	(1 << PMD_INDEX_SIZE)
+#define PTRS_PER_PUD	(1 << PUD_INDEX_SIZE)
 #define PTRS_PER_PGD	(1 << PGD_INDEX_SIZE)
 
 /* With 4k base page size, hugepage PTEs go at the PMD level */
@@ -20,13 +19,18 @@
 #define PMD_SIZE	(1UL << PMD_SHIFT)
 #define PMD_MASK	(~(PMD_SIZE-1))
 
-/* PGDIR_SHIFT determines what a third-level page table entry can map */
-#define PGDIR_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+/* PUD_SHIFT determines what a third-level page table entry can map */
+#define PUD_SHIFT	(PMD_SHIFT + PMD_INDEX_SIZE)
+#define PUD_SIZE	(1UL << PUD_SHIFT)
+#define PUD_MASK	(~(PUD_SIZE-1))
+
+/* PGDIR_SHIFT determines what a fourth-level page table entry can map */
+#define PGDIR_SHIFT	(PUD_SHIFT + PUD_INDEX_SIZE)
 #define PGDIR_SIZE	(1UL << PGDIR_SHIFT)
 #define PGDIR_MASK	(~(PGDIR_SIZE-1))
 
-#define _PAGE_COMBO	0x00040000 /* this is a combo 4k page */
-#define _PAGE_4K_PFN	0x00080000 /* PFN is for a single 4k page */
+#define _PAGE_COMBO	0x00001000 /* this is a combo 4k page */
+#define _PAGE_4K_PFN	0x00002000 /* PFN is for a single 4k page */
 /*
  * Used to track subpage group valid if _PAGE_COMBO is set
  * This overloads _PAGE_F_GIX and _PAGE_F_SECOND
@@ -39,10 +43,12 @@
 
 /* Shift to put page number into pte.
  *
- * That gives us a max RPN of 34 bits, which means a max of 50 bits
- * of addressable physical space, or 46 bits for the special 4k PFNs.
+ * That gives us a max RPN of 41 bits, which means a max of 57 bits
+ * of addressable physical space, or 53 bits for the special 4k PFNs.
  */
-#define PTE_RPN_SHIFT	(30)
+#define PTE_RPN_SHIFT	(16)
+#define PTE_RPN_SIZE	(41)
+
 /*
  * we support 16 fragments per PTE page of 64K size.
  */
@@ -54,13 +60,12 @@
 #define PTE_FRAG_SIZE_SHIFT  12
 #define PTE_FRAG_SIZE (1UL << PTE_FRAG_SIZE_SHIFT)
 
-/*
- * Bits to mask out from a PMD to get to the PTE page
- * PMDs point to PTE table fragments which are PTE_FRAG_SIZE aligned.
- */
-#define PMD_MASKED_BITS		(PTE_FRAG_SIZE - 1)
-/* Bits to mask out from a PGD/PUD to get to the PMD page */
-#define PUD_MASKED_BITS		0x1ff
+/* Bits to mask out from a PMD to get to the PTE page */
+#define PMD_MASKED_BITS		0xc0000000000000ffUL
+/* Bits to mask out from a PUD to get to the PMD page */
+#define PUD_MASKED_BITS		0xc0000000000000ffUL
+/* Bits to mask out from a PGD to get to the PUD page */
+#define PGD_MASKED_BITS		0xc0000000000000ffUL
 
 #ifndef __ASSEMBLY__
 
@@ -120,7 +125,7 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 	(((pte) & _PAGE_COMBO)? MMU_PAGE_4K: MMU_PAGE_64K)
 
 #define remap_4k_pfn(vma, addr, pfn, prot)				\
-	(WARN_ON(((pfn) >= (1UL << (64 - PTE_RPN_SHIFT)))) ? -EINVAL :	\
+	(WARN_ON(((pfn) >= (1UL << PTE_RPN_SIZE))) ? -EINVAL :	\
 		remap_pfn_range((vma), (addr), (pfn), PAGE_SIZE,	\
 			__pgprot(pgprot_val((prot)) | _PAGE_4K_PFN)))
 
@@ -130,11 +135,9 @@ extern bool __rpte_sub_valid(real_pte_t rpte, unsigned long index);
 #else
 #define PMD_TABLE_SIZE	(sizeof(pmd_t) << PMD_INDEX_SIZE)
 #endif
+#define PUD_TABLE_SIZE	(sizeof(pud_t) << PUD_INDEX_SIZE)
 #define PGD_TABLE_SIZE	(sizeof(pgd_t) << PGD_INDEX_SIZE)
 
-#define pgd_pte(pgd)	(pud_pte(((pud_t){ pgd })))
-#define pte_pgd(pte)	((pgd_t)pte_pud(pte))
-
 #ifdef CONFIG_HUGETLB_PAGE
 /*
  * We have PGD_INDEX_SIZ = 12 and PTE_INDEX_SIZE = 8, so that we can have
@@ -208,30 +211,30 @@ static inline char *get_hpte_slot_array(pmd_t *pmdp)
 /*
  * The linux hugepage PMD now include the pmd entries followed by the address
  * to the stashed pgtable_t. The stashed pgtable_t contains the hpte bits.
- * [ 1 bit secondary | 3 bit hidx | 1 bit valid | 000]. We use one byte per
+ * [ 000 | 1 bit secondary | 3 bit hidx | 1 bit valid]. We use one byte per
  * each HPTE entry. With 16MB hugepage and 64K HPTE we need 256 entries and
  * with 4K HPTE we need 4096 entries. Both will fit in a 4K pgtable_t.
  *
- * The last three bits are intentionally left to zero. This memory location
+ * The top three bits are intentionally left as zero. This memory location
  * are also used as normal page PTE pointers. So if we have any pointers
  * left around while we collapse a hugepage, we need to make sure
  * _PAGE_PRESENT bit of that is zero when we look at them
  */
 static inline unsigned int hpte_valid(unsigned char *hpte_slot_array, int index)
 {
-	return (hpte_slot_array[index] >> 3) & 0x1;
+	return hpte_slot_array[index] & 0x1;
 }
 
 static inline unsigned int hpte_hash_index(unsigned char *hpte_slot_array,
 					   int index)
 {
-	return hpte_slot_array[index] >> 4;
+	return hpte_slot_array[index] >> 1;
 }
 
 static inline void mark_hpte_slot_valid(unsigned char *hpte_slot_array,
 					unsigned int index, unsigned int hidx)
 {
-	hpte_slot_array[index] = hidx << 4 | 0x1 << 3;
+	hpte_slot_array[index] = (hidx << 1) | 0x1;
 }
 
 /*
diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h
index 8d1c8162f0c1..d0ee6fcef823 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -4,8 +4,7 @@
 
 /*
  * Common bits between 4K and 64K pages in a linux-style PTE.
- * These match the bits in the (hardware-defined) PowerPC PTE as closely
- * as possible. Additional bits may be defined in pgtable-hash64-*.h
+ * Additional bits may be defined in pgtable-hash64-*.h
  *
  * Note: We only support user read/write permissions. Supervisor always
  * have full read/write to pages above PAGE_OFFSET (pages below that
@@ -14,32 +13,35 @@
  * We could create separate kernel read-only if we used the 3 PP bits
  * combinations that newer processors provide but we currently don't.
  */
-#define _PAGE_PTE		0x00001
-#define _PAGE_PRESENT		0x00002 /* software: pte contains a translation */
-#define _PAGE_BIT_SWAP_TYPE	2
-#define _PAGE_USER		0x00004 /* matches one of the PP bits */
-#define _PAGE_EXEC		0x00008 /* No execute on POWER4 and newer (we invert) */
-#define _PAGE_GUARDED		0x00010
-/* We can derive Memory coherence from _PAGE_NO_CACHE */
+#define _PAGE_BIT_SWAP_TYPE	0
+
+#define _PAGE_EXEC		0x00001 /* execute permission */
+#define _PAGE_RW		0x00002 /* read & write access allowed */
+#define _PAGE_READ		0x00004	/* read access allowed */
+#define _PAGE_USER		0x00008 /* page may be accessed by userspace */
+#define _PAGE_GUARDED		0x00010 /* G: guarded (side-effect) page */
+/* M (memory coherence) is always set in the HPTE, so we don't need it here */
 #define _PAGE_COHERENT		0x0
 #define _PAGE_NO_CACHE		0x00020 /* I: cache inhibit */
 #define _PAGE_WRITETHRU		0x00040 /* W: cache write-through */
 #define _PAGE_DIRTY		0x00080 /* C: page changed */
 #define _PAGE_ACCESSED		0x00100 /* R: page referenced */
-#define _PAGE_RW		0x00200 /* software: user write access allowed */
-#define _PAGE_HASHPTE		0x00400 /* software: pte has an associated HPTE */
+#define _PAGE_SPECIAL		0x00400 /* software: special page */
 #define _PAGE_BUSY		0x00800 /* software: PTE & hash are busy */
-#define _PAGE_F_GIX		0x07000 /* full page: hidx bits */
-#define _PAGE_F_GIX_SHIFT	12
-#define _PAGE_F_SECOND		0x08000 /* Whether to use secondary hash or not */
-#define _PAGE_SPECIAL		0x10000 /* software: special page */
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
-#define _PAGE_SOFT_DIRTY	0x20000 /* software: software dirty tracking */
+#define _PAGE_SOFT_DIRTY	0x200 /* software: software dirty tracking */
 #else
-#define _PAGE_SOFT_DIRTY	0x00000
+#define _PAGE_SOFT_DIRTY	0x000
 #endif
 
+#define _PAGE_F_GIX_SHIFT	57
+#define _PAGE_F_GIX		(7ul << 57)	/* HPTE index within HPTEG */
+#define _PAGE_F_SECOND		(1ul << 60)	/* HPTE is in 2ndary HPTEG */
+#define _PAGE_HASHPTE		(1ul << 61)	/* PTE has associated HPTE */
+#define _PAGE_PTE		(1ul << 62)	/* distinguishes PTEs from pointers */
+#define _PAGE_PRESENT		(1ul << 63)	/* pte contains a translation */
+
 /*
  * We need to differentiate between explicit huge page and THP huge
  * page, since THP huge page also need to track real subpage details
@@ -132,7 +134,7 @@
  * The mask convered by the RPN must be a ULL on 32-bit platforms with
  * 64-bit PTEs
  */
-#define PTE_RPN_MASK	(~((1UL << PTE_RPN_SHIFT) - 1))
+#define PTE_RPN_MASK	(((1UL << PTE_RPN_SIZE) - 1) << PTE_RPN_SHIFT)
 /*
  * _PAGE_CHG_MASK masks of bits that are to be preserved across
  * pgprot changes
@@ -223,15 +225,17 @@
 #define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
 
 #ifndef __ASSEMBLY__
-#define	pmd_bad(pmd)		(!is_kernel_addr(pmd_val(pmd)) \
-				 || (pmd_val(pmd) & PMD_BAD_BITS))
-#define pmd_page_vaddr(pmd)	(pmd_val(pmd) & ~PMD_MASKED_BITS)
+#define	pmd_bad(pmd)		(pmd_val(pmd) & PMD_BAD_BITS)
+#define pmd_page_vaddr(pmd)	__va(pmd_val(pmd) & ~PMD_MASKED_BITS)
+
+#define	pud_bad(pud)		(pud_val(pud) & PUD_BAD_BITS)
+#define pud_page_vaddr(pud)	__va(pud_val(pud) & ~PUD_MASKED_BITS)
 
-#define	pud_bad(pud)		(!is_kernel_addr(pud_val(pud)) \
-				 || (pud_val(pud) & PUD_BAD_BITS))
-#define pud_page_vaddr(pud)	(pud_val(pud) & ~PUD_MASKED_BITS)
+/* Pointers in the page table tree are physical addresses */
+#define __pgtable_ptr_val(ptr)	__pa(ptr)
 
 #define pgd_index(address) (((address) >> (PGDIR_SHIFT)) & (PTRS_PER_PGD - 1))
+#define pud_index(address) (((address) >> (PUD_SHIFT)) & (PTRS_PER_PUD - 1))
 #define pmd_index(address) (((address) >> (PMD_SHIFT)) & (PTRS_PER_PMD - 1))
 #define pte_index(address) (((address) >> (PAGE_SHIFT)) & (PTRS_PER_PTE - 1))
 
@@ -360,8 +364,18 @@ static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
 	:"cc");
 }
 
+static inline int pgd_bad(pgd_t pgd)
+{
+	return (pgd_val(pgd) == 0);
+}
+
 #define __HAVE_ARCH_PTE_SAME
 #define pte_same(A,B)	(((pte_val(A) ^ pte_val(B)) & ~_PAGE_HPTEFLAGS) == 0)
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+	return (unsigned long)__va(pgd_val(pgd) & ~PGD_MASKED_BITS);
+}
+
 
 /* Generic accessors to PTE bits */
 static inline int pte_write(pte_t pte)		{ return !!(pte_val(pte) & _PAGE_RW);}
@@ -402,7 +416,7 @@ static inline int pte_protnone(pte_t pte)
 
 static inline int pte_present(pte_t pte)
 {
-	return pte_val(pte) & _PAGE_PRESENT;
+	return !!(pte_val(pte) & _PAGE_PRESENT);
 }
 
 /* Conversion functions: convert a page and protection to a page entry,
@@ -413,13 +427,13 @@ static inline int pte_present(pte_t pte)
  */
 static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot)
 {
-	return __pte(((pte_basic_t)(pfn) << PTE_RPN_SHIFT) |
+	return __pte((((pte_basic_t)(pfn) << PTE_RPN_SHIFT) & PTE_RPN_MASK) |
 		     pgprot_val(pgprot));
 }
 
 static inline unsigned long pte_pfn(pte_t pte)
 {
-	return pte_val(pte) >> PTE_RPN_SHIFT;
+	return (pte_val(pte) & PTE_RPN_MASK) >> PTE_RPN_SHIFT;
 }
 
 /* Generic modifiers for PTE bits */
diff --git a/arch/powerpc/include/asm/mmu-hash64.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
index 7352d3f212df..0cea4807e26f 100644
--- a/arch/powerpc/include/asm/mmu-hash64.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -114,6 +114,7 @@
 
 #define POWER7_TLB_SETS		128	/* # sets in POWER7 TLB */
 #define POWER8_TLB_SETS		512	/* # sets in POWER8 TLB */
+#define POWER9_TLB_SETS_HASH	256	/* # sets in POWER9 TLB Hash mode */
 
 #ifndef __ASSEMBLY__
 
@@ -607,6 +608,9 @@ static inline unsigned long get_kernel_vsid(unsigned long ea, int ssize)
 	context = (MAX_USER_CONTEXT) + ((ea >> 60) - 0xc) + 1;
 	return get_vsid(context, ea, ssize);
 }
+
+unsigned htab_shift_for_mem_size(unsigned long mem_size);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_MMU_HASH64_H_ */
diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h
index 8d1c41d28318..77d3ce05798e 100644
--- a/arch/powerpc/include/asm/book3s/64/pgtable.h
+++ b/arch/powerpc/include/asm/book3s/64/pgtable.h
@@ -43,13 +43,8 @@
  */
 #ifndef __real_pte
 
-#ifdef CONFIG_STRICT_MM_TYPECHECKS
 #define __real_pte(e,p)		((real_pte_t){(e)})
 #define __rpte_to_pte(r)	((r).pte)
-#else
-#define __real_pte(e,p)		(e)
-#define __rpte_to_pte(r)	(__pte(r))
-#endif
 #define __rpte_to_hidx(r,index)	(pte_val(__rpte_to_pte(r)) >>_PAGE_F_GIX_SHIFT)
 
 #define pte_iterate_hashed_subpages(rpte, psize, va, index, shift)       \
@@ -111,6 +106,26 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 	*pgdp = __pgd(val);
 }
 
+static inline void pgd_clear(pgd_t *pgdp)
+{
+	*pgdp = __pgd(0);
+}
+
+#define pgd_none(pgd)		(!pgd_val(pgd))
+#define pgd_present(pgd)	(!pgd_none(pgd))
+
+static inline pte_t pgd_pte(pgd_t pgd)
+{
+	return __pte(pgd_val(pgd));
+}
+
+static inline pgd_t pte_pgd(pte_t pte)
+{
+	return __pgd(pte_val(pte));
+}
+
+extern struct page *pgd_page(pgd_t pgd);
+
 /*
  * Find an entry in a page-table-directory.  We combine the address region
  * (the high order N bits) and the pgd portion of the address.
@@ -118,9 +133,10 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 
 #define pgd_offset(mm, address)	 ((mm)->pgd + pgd_index(address))
 
+#define pud_offset(pgdp, addr)	\
+	(((pud_t *) pgd_page_vaddr(*(pgdp))) + pud_index(addr))
 #define pmd_offset(pudp,addr) \
 	(((pmd_t *) pud_page_vaddr(*(pudp))) + pmd_index(addr))
-
 #define pte_offset_kernel(dir,addr) \
 	(((pte_t *) pmd_page_vaddr(*(dir))) + pte_index(addr))
 
@@ -135,6 +151,8 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 	pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e))
 #define pmd_ERROR(e) \
 	pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e))
+#define pud_ERROR(e) \
+	pr_err("%s:%d: bad pud %08lx.\n", __FILE__, __LINE__, pud_val(e))
 #define pgd_ERROR(e) \
 	pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
 
@@ -154,10 +172,10 @@ static inline void pgd_set(pgd_t *pgdp, unsigned long val)
 #define SWP_TYPE_BITS 5
 #define __swp_type(x)		(((x).val >> _PAGE_BIT_SWAP_TYPE) \
 				& ((1UL << SWP_TYPE_BITS) - 1))
-#define __swp_offset(x)		((x).val >> PTE_RPN_SHIFT)
+#define __swp_offset(x)		(((x).val & PTE_RPN_MASK) >> PTE_RPN_SHIFT)
 #define __swp_entry(type, offset)	((swp_entry_t) { \
-					((type) << _PAGE_BIT_SWAP_TYPE) \
-					| ((offset) << PTE_RPN_SHIFT) })
+				((type) << _PAGE_BIT_SWAP_TYPE) \
+				| (((offset) << PTE_RPN_SHIFT) & PTE_RPN_MASK)})
 /*
  * swp_entry_t must be independent of pte bits. We build a swp_entry_t from
  * swap type and offset we get from swap and convert that to pte to find a
@@ -281,6 +299,10 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp);
 extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 			    pmd_t *pmdp);
 
+#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
+extern void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+				    unsigned long address, pmd_t *pmdp);
+
 #define pmd_move_must_withdraw pmd_move_must_withdraw
 struct spinlock;
 static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl,
diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
new file mode 100644
index 000000000000..1b753f96b374
--- /dev/null
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h
@@ -0,0 +1,94 @@
+#ifndef _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
+#define _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H
+
+#define MMU_NO_CONTEXT		0
+
+/*
+ * TLB flushing for 64-bit hash-MMU CPUs
+ */
+
+#include <linux/percpu.h>
+#include <asm/page.h>
+
+#define PPC64_TLB_BATCH_NR 192
+
+struct ppc64_tlb_batch {
+	int			active;
+	unsigned long		index;
+	struct mm_struct	*mm;
+	real_pte_t		pte[PPC64_TLB_BATCH_NR];
+	unsigned long		vpn[PPC64_TLB_BATCH_NR];
+	unsigned int		psize;
+	int			ssize;
+};
+DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
+
+extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
+
+#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
+
+static inline void arch_enter_lazy_mmu_mode(void)
+{
+	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+
+	batch->active = 1;
+}
+
+static inline void arch_leave_lazy_mmu_mode(void)
+{
+	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
+
+	if (batch->index)
+		__flush_tlb_pending(batch);
+	batch->active = 0;
+}
+
+#define arch_flush_lazy_mmu_mode()      do {} while (0)
+
+
+extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize,
+			    int ssize, unsigned long flags);
+extern void flush_hash_range(unsigned long number, int local);
+extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
+				pmd_t *pmdp, unsigned int psize, int ssize,
+				unsigned long flags);
+
+static inline void local_flush_tlb_mm(struct mm_struct *mm)
+{
+}
+
+static inline void flush_tlb_mm(struct mm_struct *mm)
+{
+}
+
+static inline void local_flush_tlb_page(struct vm_area_struct *vma,
+					unsigned long vmaddr)
+{
+}
+
+static inline void flush_tlb_page(struct vm_area_struct *vma,
+				  unsigned long vmaddr)
+{
+}
+
+static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
+					 unsigned long vmaddr)
+{
+}
+
+static inline void flush_tlb_range(struct vm_area_struct *vma,
+				   unsigned long start, unsigned long end)
+{
+}
+
+static inline void flush_tlb_kernel_range(unsigned long start,
+					  unsigned long end)
+{
+}
+
+/* Private function for use by PCI IO mapping code */
+extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
+				     unsigned long end);
+extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
+				unsigned long addr);
+#endif /*  _ASM_POWERPC_BOOK3S_64_TLBFLUSH_HASH_H */
diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
index d1a8d93cccfd..44efe739b6b9 100644
--- a/arch/powerpc/include/asm/cmpxchg.h
+++ b/arch/powerpc/include/asm/cmpxchg.h
@@ -5,25 +5,25 @@
 #include <linux/compiler.h>
 #include <asm/synch.h>
 #include <asm/asm-compat.h>
+#include <linux/bug.h>
 
 /*
  * Atomic exchange
  *
- * Changes the memory location '*ptr' to be val and returns
+ * Changes the memory location '*p' to be val and returns
  * the previous value stored there.
  */
+
 static __always_inline unsigned long
-__xchg_u32(volatile void *p, unsigned long val)
+__xchg_u32_local(volatile void *p, unsigned long val)
 {
 	unsigned long prev;
 
 	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%2 \n"
 	PPC405_ERR77(0,%2)
 "	stwcx.	%3,0,%2 \n\
 	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (prev), "+m" (*(volatile unsigned int *)p)
 	: "r" (p), "r" (val)
 	: "cc", "memory");
@@ -31,42 +31,34 @@ __xchg_u32(volatile void *p, unsigned long val)
 	return prev;
 }
 
-/*
- * Atomic exchange
- *
- * Changes the memory location '*ptr' to be val and returns
- * the previous value stored there.
- */
 static __always_inline unsigned long
-__xchg_u32_local(volatile void *p, unsigned long val)
+__xchg_u32_relaxed(u32 *p, unsigned long val)
 {
 	unsigned long prev;
 
 	__asm__ __volatile__(
-"1:	lwarx	%0,0,%2 \n"
-	PPC405_ERR77(0,%2)
-"	stwcx.	%3,0,%2 \n\
-	bne-	1b"
-	: "=&r" (prev), "+m" (*(volatile unsigned int *)p)
+"1:	lwarx	%0,0,%2\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*p)
 	: "r" (p), "r" (val)
-	: "cc", "memory");
+	: "cc");
 
 	return prev;
 }
 
 #ifdef CONFIG_PPC64
 static __always_inline unsigned long
-__xchg_u64(volatile void *p, unsigned long val)
+__xchg_u64_local(volatile void *p, unsigned long val)
 {
 	unsigned long prev;
 
 	__asm__ __volatile__(
-	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%2 \n"
 	PPC405_ERR77(0,%2)
 "	stdcx.	%3,0,%2 \n\
 	bne-	1b"
-	PPC_ATOMIC_EXIT_BARRIER
 	: "=&r" (prev), "+m" (*(volatile unsigned long *)p)
 	: "r" (p), "r" (val)
 	: "cc", "memory");
@@ -75,64 +67,52 @@ __xchg_u64(volatile void *p, unsigned long val)
 }
 
 static __always_inline unsigned long
-__xchg_u64_local(volatile void *p, unsigned long val)
+__xchg_u64_relaxed(u64 *p, unsigned long val)
 {
 	unsigned long prev;
 
 	__asm__ __volatile__(
-"1:	ldarx	%0,0,%2 \n"
-	PPC405_ERR77(0,%2)
-"	stdcx.	%3,0,%2 \n\
-	bne-	1b"
-	: "=&r" (prev), "+m" (*(volatile unsigned long *)p)
+"1:	ldarx	%0,0,%2\n"
+	PPC405_ERR77(0, %2)
+"	stdcx.	%3,0,%2\n"
+"	bne-	1b"
+	: "=&r" (prev), "+m" (*p)
 	: "r" (p), "r" (val)
-	: "cc", "memory");
+	: "cc");
 
 	return prev;
 }
 #endif
 
-/*
- * This function doesn't exist, so you'll get a linker error
- * if something tries to do an invalid xchg().
- */
-extern void __xchg_called_with_bad_pointer(void);
-
 static __always_inline unsigned long
-__xchg(volatile void *ptr, unsigned long x, unsigned int size)
+__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
 	case 4:
-		return __xchg_u32(ptr, x);
+		return __xchg_u32_local(ptr, x);
 #ifdef CONFIG_PPC64
 	case 8:
-		return __xchg_u64(ptr, x);
+		return __xchg_u64_local(ptr, x);
 #endif
 	}
-	__xchg_called_with_bad_pointer();
+	BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg");
 	return x;
 }
 
 static __always_inline unsigned long
-__xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
+__xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
 {
 	switch (size) {
 	case 4:
-		return __xchg_u32_local(ptr, x);
+		return __xchg_u32_relaxed(ptr, x);
 #ifdef CONFIG_PPC64
 	case 8:
-		return __xchg_u64_local(ptr, x);
+		return __xchg_u64_relaxed(ptr, x);
 #endif
 	}
-	__xchg_called_with_bad_pointer();
+	BUILD_BUG_ON_MSG(1, "Unsupported size for __xchg_local");
 	return x;
 }
-#define xchg(ptr,x)							     \
-  ({									     \
-     __typeof__(*(ptr)) _x_ = (x);					     \
-     (__typeof__(*(ptr))) __xchg((ptr), (unsigned long)_x_, sizeof(*(ptr))); \
-  })
-
 #define xchg_local(ptr,x)						     \
   ({									     \
      __typeof__(*(ptr)) _x_ = (x);					     \
@@ -140,6 +120,12 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size)
      		(unsigned long)_x_, sizeof(*(ptr))); 			     \
   })
 
+#define xchg_relaxed(ptr, x)						\
+({									\
+	__typeof__(*(ptr)) _x_ = (x);					\
+	(__typeof__(*(ptr))) __xchg_relaxed((ptr),			\
+			(unsigned long)_x_, sizeof(*(ptr)));		\
+})
 /*
  * Compare and exchange - if *p == old, set it to new,
  * and return the old value of *p.
@@ -190,6 +176,56 @@ __cmpxchg_u32_local(volatile unsigned int *p, unsigned long old,
 	return prev;
 }
 
+static __always_inline unsigned long
+__cmpxchg_u32_relaxed(u32 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%2		# __cmpxchg_u32_relaxed\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc");
+
+	return prev;
+}
+
+/*
+ * cmpxchg family don't have order guarantee if cmp part fails, therefore we
+ * can avoid superfluous barriers if we use assembly code to implement
+ * cmpxchg() and cmpxchg_acquire(), however we don't do the similar for
+ * cmpxchg_release() because that will result in putting a barrier in the
+ * middle of a ll/sc loop, which is probably a bad idea. For example, this
+ * might cause the conditional store more likely to fail.
+ */
+static __always_inline unsigned long
+__cmpxchg_u32_acquire(u32 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	lwarx	%0,0,%2		# __cmpxchg_u32_acquire\n"
+"	cmpw	0,%0,%3\n"
+"	bne-	2f\n"
+	PPC405_ERR77(0, %2)
+"	stwcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ACQUIRE_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+
 #ifdef CONFIG_PPC64
 static __always_inline unsigned long
 __cmpxchg_u64(volatile unsigned long *p, unsigned long old, unsigned long new)
@@ -233,11 +269,47 @@ __cmpxchg_u64_local(volatile unsigned long *p, unsigned long old,
 
 	return prev;
 }
-#endif
 
-/* This function doesn't exist, so you'll get a linker error
-   if something tries to do an invalid cmpxchg().  */
-extern void __cmpxchg_called_with_bad_pointer(void);
+static __always_inline unsigned long
+__cmpxchg_u64_relaxed(u64 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	ldarx	%0,0,%2		# __cmpxchg_u64_relaxed\n"
+"	cmpd	0,%0,%3\n"
+"	bne-	2f\n"
+"	stdcx.	%4,0,%2\n"
+"	bne-	1b\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc");
+
+	return prev;
+}
+
+static __always_inline unsigned long
+__cmpxchg_u64_acquire(u64 *p, unsigned long old, unsigned long new)
+{
+	unsigned long prev;
+
+	__asm__ __volatile__ (
+"1:	ldarx	%0,0,%2		# __cmpxchg_u64_acquire\n"
+"	cmpd	0,%0,%3\n"
+"	bne-	2f\n"
+"	stdcx.	%4,0,%2\n"
+"	bne-	1b\n"
+	PPC_ACQUIRE_BARRIER
+	"\n"
+"2:"
+	: "=&r" (prev), "+m" (*p)
+	: "r" (p), "r" (old), "r" (new)
+	: "cc", "memory");
+
+	return prev;
+}
+#endif
 
 static __always_inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
@@ -251,7 +323,7 @@ __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new,
 		return __cmpxchg_u64(ptr, old, new);
 #endif
 	}
-	__cmpxchg_called_with_bad_pointer();
+	BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg");
 	return old;
 }
 
@@ -267,10 +339,41 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 		return __cmpxchg_u64_local(ptr, old, new);
 #endif
 	}
-	__cmpxchg_called_with_bad_pointer();
+	BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_local");
+	return old;
+}
+
+static __always_inline unsigned long
+__cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
+		  unsigned int size)
+{
+	switch (size) {
+	case 4:
+		return __cmpxchg_u32_relaxed(ptr, old, new);
+#ifdef CONFIG_PPC64
+	case 8:
+		return __cmpxchg_u64_relaxed(ptr, old, new);
+#endif
+	}
+	BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_relaxed");
 	return old;
 }
 
+static __always_inline unsigned long
+__cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
+		  unsigned int size)
+{
+	switch (size) {
+	case 4:
+		return __cmpxchg_u32_acquire(ptr, old, new);
+#ifdef CONFIG_PPC64
+	case 8:
+		return __cmpxchg_u64_acquire(ptr, old, new);
+#endif
+	}
+	BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_acquire");
+	return old;
+}
 #define cmpxchg(ptr, o, n)						 \
   ({									 \
      __typeof__(*(ptr)) _o_ = (o);					 \
@@ -288,6 +391,23 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 				    (unsigned long)_n_, sizeof(*(ptr))); \
   })
 
+#define cmpxchg_relaxed(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_relaxed((ptr),			\
+			(unsigned long)_o_, (unsigned long)_n_,		\
+			sizeof(*(ptr)));				\
+})
+
+#define cmpxchg_acquire(ptr, o, n)					\
+({									\
+	__typeof__(*(ptr)) _o_ = (o);					\
+	__typeof__(*(ptr)) _n_ = (n);					\
+	(__typeof__(*(ptr))) __cmpxchg_acquire((ptr),			\
+			(unsigned long)_o_, (unsigned long)_n_,		\
+			sizeof(*(ptr)));				\
+})
 #ifdef CONFIG_PPC64
 #define cmpxchg64(ptr, o, n)						\
   ({									\
@@ -299,7 +419,16 @@ __cmpxchg_local(volatile void *ptr, unsigned long old, unsigned long new,
 	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
 	cmpxchg_local((ptr), (o), (n));					\
   })
-#define cmpxchg64_relaxed	cmpxchg64_local
+#define cmpxchg64_relaxed(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_relaxed((ptr), (o), (n));				\
+})
+#define cmpxchg64_acquire(ptr, o, n)					\
+({									\
+	BUILD_BUG_ON(sizeof(*(ptr)) != 8);				\
+	cmpxchg_acquire((ptr), (o), (n));				\
+})
 #else
 #include <asm-generic/cmpxchg-local.h>
 #define cmpxchg64_local(ptr, o, n) __cmpxchg64_local_generic((ptr), (o), (n))
diff --git a/arch/powerpc/include/asm/cputable.h b/arch/powerpc/include/asm/cputable.h
index b118072670fb..94ace9b4c4e1 100644
--- a/arch/powerpc/include/asm/cputable.h
+++ b/arch/powerpc/include/asm/cputable.h
@@ -171,7 +171,7 @@ enum {
 #define CPU_FTR_ARCH_201		LONG_ASM_CONST(0x0000000200000000)
 #define CPU_FTR_ARCH_206		LONG_ASM_CONST(0x0000000400000000)
 #define CPU_FTR_ARCH_207S		LONG_ASM_CONST(0x0000000800000000)
-/* Free					LONG_ASM_CONST(0x0000001000000000) */
+#define CPU_FTR_ARCH_300		LONG_ASM_CONST(0x0000001000000000)
 #define CPU_FTR_MMCRA			LONG_ASM_CONST(0x0000002000000000)
 #define CPU_FTR_CTRL			LONG_ASM_CONST(0x0000004000000000)
 #define CPU_FTR_SMT			LONG_ASM_CONST(0x0000008000000000)
@@ -196,6 +196,7 @@ enum {
 #define CPU_FTR_DAWR			LONG_ASM_CONST(0x0400000000000000)
 #define CPU_FTR_DABRX			LONG_ASM_CONST(0x0800000000000000)
 #define CPU_FTR_PMAO_BUG		LONG_ASM_CONST(0x1000000000000000)
+#define CPU_FTR_SUBCORE			LONG_ASM_CONST(0x2000000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -443,9 +444,19 @@ enum {
 	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
 	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
-	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP)
+	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_SUBCORE)
 #define CPU_FTRS_POWER8E (CPU_FTRS_POWER8 | CPU_FTR_PMAO_BUG)
 #define CPU_FTRS_POWER8_DD1 (CPU_FTRS_POWER8 & ~CPU_FTR_DBELL)
+#define CPU_FTRS_POWER9 (CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
+	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | CPU_FTR_ARCH_206 |\
+	    CPU_FTR_MMCRA | CPU_FTR_SMT | \
+	    CPU_FTR_COHERENT_ICACHE | \
+	    CPU_FTR_PURR | CPU_FTR_SPURR | CPU_FTR_REAL_LE | \
+	    CPU_FTR_DSCR | CPU_FTR_SAO  | \
+	    CPU_FTR_STCX_CHECKS_ADDRESS | CPU_FTR_POPCNTB | CPU_FTR_POPCNTD | \
+	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
+	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
+	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -464,7 +475,7 @@ enum {
 	    (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
 	     CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
 	     CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
-	     CPU_FTRS_PA6T | CPU_FTR_VSX)
+	     CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9)
 #endif
 #else
 enum {
@@ -515,7 +526,8 @@ enum {
 	    (CPU_FTRS_POWER4 & CPU_FTRS_PPC970 & CPU_FTRS_POWER5 & \
 	     CPU_FTRS_POWER6 & CPU_FTRS_POWER7 & CPU_FTRS_CELL & \
 	     CPU_FTRS_PA6T & CPU_FTRS_POWER8 & CPU_FTRS_POWER8E & \
-	     CPU_FTRS_POWER8_DD1 & ~CPU_FTR_HVMODE & CPU_FTRS_POSSIBLE)
+	     CPU_FTRS_POWER8_DD1 & ~CPU_FTR_HVMODE & CPU_FTRS_POSSIBLE & \
+	     CPU_FTRS_POWER9)
 #endif
 #else
 enum {
diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index c5eb86f3d452..fb9f376ae27b 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -72,6 +72,7 @@ struct pci_dn;
 #define EEH_PE_PHB	(1 << 1)	/* PHB PE    */
 #define EEH_PE_DEVICE 	(1 << 2)	/* Device PE */
 #define EEH_PE_BUS	(1 << 3)	/* Bus PE    */
+#define EEH_PE_VF	(1 << 4)	/* VF PE     */
 
 #define EEH_PE_ISOLATED		(1 << 0)	/* Isolated PE		*/
 #define EEH_PE_RECOVERING	(1 << 1)	/* Recovering PE	*/
@@ -81,6 +82,7 @@ struct pci_dn;
 #define EEH_PE_KEEP		(1 << 8)	/* Keep PE on hotplug	*/
 #define EEH_PE_CFG_RESTRICTED	(1 << 9)	/* Block config on error */
 #define EEH_PE_REMOVED		(1 << 10)	/* Removed permanently	*/
+#define EEH_PE_PRI_BUS		(1 << 11)	/* Cached primary bus   */
 
 struct eeh_pe {
 	int type;			/* PE type: PHB/Bus/Device	*/
@@ -135,11 +137,15 @@ struct eeh_dev {
 	int pcix_cap;			/* Saved PCIx capability	*/
 	int pcie_cap;			/* Saved PCIe capability	*/
 	int aer_cap;			/* Saved AER capability		*/
+	int af_cap;			/* Saved AF capability		*/
 	struct eeh_pe *pe;		/* Associated PE		*/
 	struct list_head list;		/* Form link list in the PE	*/
+	struct list_head rmv_list;	/* Record the removed edevs	*/
 	struct pci_controller *phb;	/* Associated PHB		*/
 	struct pci_dn *pdn;		/* Associated PCI device node	*/
 	struct pci_dev *pdev;		/* Associated PCI device	*/
+	bool in_error;			/* Error flag for edev		*/
+	struct pci_dev *physfn;		/* Associated SRIOV PF		*/
 	struct pci_bus *bus;		/* PCI bus for partial hotplug	*/
 };
 
diff --git a/arch/powerpc/include/asm/hugetlb.h b/arch/powerpc/include/asm/hugetlb.h
index 7eac89b9f02e..42814f0567cc 100644
--- a/arch/powerpc/include/asm/hugetlb.h
+++ b/arch/powerpc/include/asm/hugetlb.h
@@ -19,7 +19,7 @@ static inline pte_t *hugepd_page(hugepd_t hpd)
 	 * We have only four bits to encode, MMU page size
 	 */
 	BUILD_BUG_ON((MMU_PAGE_COUNT - 1) > 0xf);
-	return (pte_t *)(hpd.pd & ~HUGEPD_SHIFT_MASK);
+	return __va(hpd.pd & HUGEPD_ADDR_MASK);
 }
 
 static inline unsigned int hugepd_mmu_psize(hugepd_t hpd)
diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index e3b54dd4f730..0bc9c284aa10 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -94,6 +94,7 @@
 #define H_SG_LIST	-72
 #define H_OP_MODE	-73
 #define H_COP_HW	-74
+#define H_STATE		-75
 #define H_UNSUPPORTED_FLAG_START	-256
 #define H_UNSUPPORTED_FLAG_END		-511
 #define H_MULTI_THREADS_ACTIVE	-9005
diff --git a/arch/powerpc/include/asm/hydra.h b/arch/powerpc/include/asm/hydra.h
index 1cb39c96d155..b3b0f2d020f0 100644
--- a/arch/powerpc/include/asm/hydra.h
+++ b/arch/powerpc/include/asm/hydra.h
@@ -89,7 +89,7 @@ extern volatile struct Hydra __iomem *Hydra;
 #define HYDRA_INT_EXT2		13	/* PCI IRQX */
 #define HYDRA_INT_EXT3		14	/* PCI IRQY */
 #define HYDRA_INT_EXT4		15	/* PCI IRQZ */
-#define HYDRA_INT_EXT5		16	/* IDE Primay/Secondary */
+#define HYDRA_INT_EXT5		16	/* IDE Primary/Secondary */
 #define HYDRA_INT_EXT6		17	/* IDE Secondary */
 #define HYDRA_INT_EXT7		18	/* Power Off Request */
 #define HYDRA_INT_SPARE		19
diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 6c1297ec374c..2fd1690b79d2 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -300,7 +300,7 @@ extern void _memcpy_toio(volatile void __iomem *dest, const void *src,
  * When CONFIG_PPC_INDIRECT_MMIO is set, the platform can provide hooks
  * on all MMIOs. (Note that this is all 64 bits only for now)
  *
- * To help platforms who may need to differenciate MMIO addresses in
+ * To help platforms who may need to differentiate MMIO addresses in
  * their hooks, a bitfield is reserved for use by the platform near the
  * top of MMIO addresses (not PIO, those have to cope the hard way).
  *
diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 3f191f573d4f..fd22442d30a9 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -54,7 +54,7 @@ struct machdep_calls {
 				       int psize, int apsize,
 				       int ssize);
 	long		(*hpte_remove)(unsigned long hpte_group);
-	void            (*hpte_removebolted)(unsigned long ea,
+	int             (*hpte_removebolted)(unsigned long ea,
 					     int psize, int ssize);
 	void		(*flush_hash_range)(unsigned long number, int local);
 	void		(*hugepage_invalidate)(unsigned long vsid,
@@ -174,11 +174,11 @@ struct machdep_calls {
 	   platform, called once per cpu. */
 	void		(*enable_pmcs)(void);
 
-	/* Set DABR for this platform, leave empty for default implemenation */
+	/* Set DABR for this platform, leave empty for default implementation */
 	int		(*set_dabr)(unsigned long dabr,
 				    unsigned long dabrx);
 
-	/* Set DAWR for this platform, leave empty for default implemenation */
+	/* Set DAWR for this platform, leave empty for default implementation */
 	int		(*set_dawr)(unsigned long dawr,
 				    unsigned long dawrx);
 
diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
index 3d5abfe6ba67..8ca1c983bf6c 100644
--- a/arch/powerpc/include/asm/mmu.h
+++ b/arch/powerpc/include/asm/mmu.h
@@ -97,6 +97,7 @@
 #define MMU_FTRS_POWER6		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
 #define MMU_FTRS_POWER7		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
 #define MMU_FTRS_POWER8		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
+#define MMU_FTRS_POWER9		MMU_FTRS_POWER4 | MMU_FTR_LOCKLESS_TLBIE
 #define MMU_FTRS_CELL		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
 				MMU_FTR_CI_LARGE_PAGE
 #define MMU_FTRS_PA6T		MMU_FTRS_DEFAULT_HPTE_ARCH_V2 | \
@@ -182,10 +183,10 @@ static inline void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 
 #if defined(CONFIG_PPC_STD_MMU_64)
 /* 64-bit classic hash table MMU */
-#  include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #elif defined(CONFIG_PPC_STD_MMU_32)
 /* 32-bit classic hash table MMU */
-#  include <asm/mmu-hash32.h>
+#include <asm/book3s/32/mmu-hash.h>
 #elif defined(CONFIG_40x)
 /* 40x-style software loaded TLB */
 #  include <asm/mmu-40x.h>
diff --git a/arch/powerpc/include/asm/module.h b/arch/powerpc/include/asm/module.h
index 5b6b5a427b54..cd4ffd86765f 100644
--- a/arch/powerpc/include/asm/module.h
+++ b/arch/powerpc/include/asm/module.h
@@ -19,7 +19,7 @@
  * Thanks to Paul M for explaining this.
  *
  * PPC can only do rel jumps += 32MB, and often the kernel and other
- * modules are furthur away than this.  So, we jump to a table of
+ * modules are further away than this.  So, we jump to a table of
  * trampolines attached to the module (the Procedure Linkage Table)
  * whenever that happens.
  */
diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h
index b9f734dd5b81..10debb93c4a4 100644
--- a/arch/powerpc/include/asm/nohash/64/pgtable.h
+++ b/arch/powerpc/include/asm/nohash/64/pgtable.h
@@ -108,6 +108,9 @@
 #ifndef __ASSEMBLY__
 /* pte_clear moved to later in this file */
 
+/* Pointers in the page table tree are virtual addresses */
+#define __pgtable_ptr_val(ptr)	((unsigned long)(ptr))
+
 #define PMD_BAD_BITS		(PTE_TABLE_SIZE-1)
 #define PUD_BAD_BITS		(PMD_TABLE_SIZE-1)
 
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index 07a99e638449..9d86c6651716 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -248,6 +248,7 @@ extern int opal_elog_init(void);
 extern void opal_platform_dump_init(void);
 extern void opal_sys_param_init(void);
 extern void opal_msglog_init(void);
+extern void opal_msglog_sysfs_init(void);
 extern int opal_async_comp_init(void);
 extern int opal_sensor_init(void);
 extern int opal_hmi_handler_init(void);
@@ -273,6 +274,8 @@ void opal_free_sg_list(struct opal_sg_list *sg);
 
 extern int opal_error_code(int rc);
 
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_OPAL_H */
diff --git a/arch/powerpc/include/asm/page.h b/arch/powerpc/include/asm/page.h
index e34124f6fbf2..ab3d8977bacd 100644
--- a/arch/powerpc/include/asm/page.h
+++ b/arch/powerpc/include/asm/page.h
@@ -271,6 +271,13 @@ extern long long virt_phys_offset;
 #else
 #define PD_HUGE 0x80000000
 #endif
+
+#else	/* CONFIG_PPC_BOOK3S_64 */
+/*
+ * Book3S 64 stores real addresses in the hugepd entries to
+ * avoid overlaps with _PAGE_PRESENT and _PAGE_PTE.
+ */
+#define HUGEPD_ADDR_MASK	(0x0ffffffffffffffful & ~HUGEPD_SHIFT_MASK)
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 /*
@@ -281,109 +288,7 @@ extern long long virt_phys_offset;
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_STRICT_MM_TYPECHECKS
-/* These are used to make use of C type-checking. */
-
-/* PTE level */
-typedef struct { pte_basic_t pte; } pte_t;
-#define __pte(x)	((pte_t) { (x) })
-static inline pte_basic_t pte_val(pte_t x)
-{
-	return x.pte;
-}
-
-/* 64k pages additionally define a bigger "real PTE" type that gathers
- * the "second half" part of the PTE for pseudo 64k pages
- */
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
-typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
-#else
-typedef struct { pte_t pte; } real_pte_t;
-#endif
-
-/* PMD level */
-#ifdef CONFIG_PPC64
-typedef struct { unsigned long pmd; } pmd_t;
-#define __pmd(x)	((pmd_t) { (x) })
-static inline unsigned long pmd_val(pmd_t x)
-{
-	return x.pmd;
-}
-
-/* PUD level exusts only on 4k pages */
-#ifndef CONFIG_PPC_64K_PAGES
-typedef struct { unsigned long pud; } pud_t;
-#define __pud(x)	((pud_t) { (x) })
-static inline unsigned long pud_val(pud_t x)
-{
-	return x.pud;
-}
-#endif /* !CONFIG_PPC_64K_PAGES */
-#endif /* CONFIG_PPC64 */
-
-/* PGD level */
-typedef struct { unsigned long pgd; } pgd_t;
-#define __pgd(x)	((pgd_t) { (x) })
-static inline unsigned long pgd_val(pgd_t x)
-{
-	return x.pgd;
-}
-
-/* Page protection bits */
-typedef struct { unsigned long pgprot; } pgprot_t;
-#define pgprot_val(x)	((x).pgprot)
-#define __pgprot(x)	((pgprot_t) { (x) })
-
-#else
-
-/*
- * .. while these make it easier on the compiler
- */
-
-typedef pte_basic_t pte_t;
-#define __pte(x)	(x)
-static inline pte_basic_t pte_val(pte_t pte)
-{
-	return pte;
-}
-
-#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
-typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
-#else
-typedef pte_t real_pte_t;
-#endif
-
-
-#ifdef CONFIG_PPC64
-typedef unsigned long pmd_t;
-#define __pmd(x)	(x)
-static inline unsigned long pmd_val(pmd_t pmd)
-{
-	return pmd;
-}
-
-#ifndef CONFIG_PPC_64K_PAGES
-typedef unsigned long pud_t;
-#define __pud(x)	(x)
-static inline unsigned long pud_val(pud_t pud)
-{
-	return pud;
-}
-#endif /* !CONFIG_PPC_64K_PAGES */
-#endif /* CONFIG_PPC64 */
-
-typedef unsigned long pgd_t;
-#define __pgd(x)	(x)
-static inline unsigned long pgd_val(pgd_t pgd)
-{
-	return pgd;
-}
-
-typedef unsigned long pgprot_t;
-#define pgprot_val(x)	(x)
-#define __pgprot(x)	(x)
-
-#endif
+#include <asm/pgtable-types.h>
 
 typedef struct { signed long pd; } hugepd_t;
 
diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h
index 54843ca5fa2b..9f165e8a77bf 100644
--- a/arch/powerpc/include/asm/pci-bridge.h
+++ b/arch/powerpc/include/asm/pci-bridge.h
@@ -212,15 +212,16 @@ struct pci_dn {
 #define IODA_INVALID_PE		(-1)
 #ifdef CONFIG_PPC_POWERNV
 	int	pe_number;
+	int     vf_index;		/* VF index in the PF */
 #ifdef CONFIG_PCI_IOV
 	u16     vfs_expanded;		/* number of VFs IOV BAR expanded */
 	u16     num_vfs;		/* number of VFs enabled*/
-	int     offset;			/* PE# for the first VF PE */
-#define M64_PER_IOV 4
-	int     m64_per_iov;
+	int     *pe_num_map;		/* PE# for the first VF PE or array */
+	bool    m64_single_mode;	/* Use M64 BAR in Single Mode */
 #define IODA_INVALID_M64        (-1)
-	int     m64_wins[PCI_SRIOV_NUM_BARS][M64_PER_IOV];
+	int     (*m64_map)[PCI_SRIOV_NUM_BARS];
 #endif /* CONFIG_PCI_IOV */
+	int	mps;			/* Maximum Payload Size */
 #endif
 	struct list_head child_list;
 	struct list_head list;
diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 814622146d5a..e157489ee7a1 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -136,16 +136,24 @@ extern ssize_t power_events_sysfs_show(struct device *dev,
  * event 'cpu-cycles' can have two entries in sysfs: 'cpu-cycles' and
  * 'PM_CYC' where the latter is the name by which the event is known in
  * POWER CPU specification.
+ *
+ * Similarly, some hardware and cache events use the same event code. Eg.
+ * on POWER8, both "cache-references" and "L1-dcache-loads" events refer
+ * to the same event, PM_LD_REF_L1.  The suffix, allows us to have two
+ * sysfs objects for the same event and thus two entries/aliases in sysfs.
  */
 #define	EVENT_VAR(_id, _suffix)		event_attr_##_id##_suffix
 #define	EVENT_PTR(_id, _suffix)		&EVENT_VAR(_id, _suffix).attr.attr
 
 #define	EVENT_ATTR(_name, _id, _suffix)					\
-	PMU_EVENT_ATTR(_name, EVENT_VAR(_id, _suffix), PME_##_id,	\
+	PMU_EVENT_ATTR(_name, EVENT_VAR(_id, _suffix), _id,		\
 			power_events_sysfs_show)
 
 #define	GENERIC_EVENT_ATTR(_name, _id)	EVENT_ATTR(_name, _id, _g)
 #define	GENERIC_EVENT_PTR(_id)		EVENT_PTR(_id, _g)
 
+#define	CACHE_EVENT_ATTR(_name, _id)	EVENT_ATTR(_name, _id, _c)
+#define	CACHE_EVENT_PTR(_id)		EVENT_PTR(_id, _c)
+
 #define	POWER_EVENT_ATTR(_name, _id)	EVENT_ATTR(_name, _id, _p)
 #define	POWER_EVENT_PTR(_id)		EVENT_PTR(_id, _p)
diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h
index 69ef28a81733..8d5fc3ac43da 100644
--- a/arch/powerpc/include/asm/pgalloc-64.h
+++ b/arch/powerpc/include/asm/pgalloc-64.h
@@ -53,7 +53,7 @@ static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 
 #ifndef CONFIG_PPC_64K_PAGES
 
-#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, (unsigned long)PUD)
+#define pgd_populate(MM, PGD, PUD)	pgd_set(PGD, __pgtable_ptr_val(PUD))
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
@@ -68,19 +68,19 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud)
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
-	pud_set(pud, (unsigned long)pmd);
+	pud_set(pud, __pgtable_ptr_val(pmd));
 }
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
 				       pte_t *pte)
 {
-	pmd_set(pmd, (unsigned long)pte);
+	pmd_set(pmd, __pgtable_ptr_val(pte));
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t pte_page)
 {
-	pmd_set(pmd, (unsigned long)page_address(pte_page));
+	pmd_set(pmd, __pgtable_ptr_val(page_address(pte_page)));
 }
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
@@ -171,23 +171,45 @@ extern void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift);
 extern void __tlb_remove_table(void *_table);
 #endif
 
-#define pud_populate(mm, pud, pmd)	pud_set(pud, (unsigned long)pmd)
+#ifndef __PAGETABLE_PUD_FOLDED
+/* book3s 64 is 4 level page table */
+static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
+{
+	pgd_set(pgd, __pgtable_ptr_val(pud));
+}
+
+static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
+{
+	return kmem_cache_alloc(PGT_CACHE(PUD_INDEX_SIZE),
+				GFP_KERNEL|__GFP_REPEAT);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+	kmem_cache_free(PGT_CACHE(PUD_INDEX_SIZE), pud);
+}
+#endif
+
+static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
+{
+	pud_set(pud, __pgtable_ptr_val(pmd));
+}
 
 static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd,
 				       pte_t *pte)
 {
-	pmd_set(pmd, (unsigned long)pte);
+	pmd_set(pmd, __pgtable_ptr_val(pte));
 }
 
 static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 				pgtable_t pte_page)
 {
-	pmd_set(pmd, (unsigned long)pte_page);
+	pmd_set(pmd, __pgtable_ptr_val(pte_page));
 }
 
 static inline pgtable_t pmd_pgtable(pmd_t pmd)
 {
-	return (pgtable_t)(pmd_val(pmd) & ~PMD_MASKED_BITS);
+	return (pgtable_t)pmd_page_vaddr(pmd);
 }
 
 static inline pte_t *pte_alloc_one_kernel(struct mm_struct *mm,
@@ -233,11 +255,11 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
 
 #define __pmd_free_tlb(tlb, pmd, addr)		      \
 	pgtable_free_tlb(tlb, pmd, PMD_CACHE_INDEX)
-#ifndef CONFIG_PPC_64K_PAGES
+#ifndef __PAGETABLE_PUD_FOLDED
 #define __pud_free_tlb(tlb, pud, addr)		      \
 	pgtable_free_tlb(tlb, pud, PUD_INDEX_SIZE)
 
-#endif /* CONFIG_PPC_64K_PAGES */
+#endif /* __PAGETABLE_PUD_FOLDED */
 
 #define check_pgt_cache()	do { } while (0)
 
diff --git a/arch/powerpc/include/asm/pgtable-types.h b/arch/powerpc/include/asm/pgtable-types.h
new file mode 100644
index 000000000000..43140f8b0592
--- /dev/null
+++ b/arch/powerpc/include/asm/pgtable-types.h
@@ -0,0 +1,103 @@
+#ifndef _ASM_POWERPC_PGTABLE_TYPES_H
+#define _ASM_POWERPC_PGTABLE_TYPES_H
+
+#ifdef CONFIG_STRICT_MM_TYPECHECKS
+/* These are used to make use of C type-checking. */
+
+/* PTE level */
+typedef struct { pte_basic_t pte; } pte_t;
+#define __pte(x)	((pte_t) { (x) })
+static inline pte_basic_t pte_val(pte_t x)
+{
+	return x.pte;
+}
+
+/* PMD level */
+#ifdef CONFIG_PPC64
+typedef struct { unsigned long pmd; } pmd_t;
+#define __pmd(x)	((pmd_t) { (x) })
+static inline unsigned long pmd_val(pmd_t x)
+{
+	return x.pmd;
+}
+
+/*
+ * 64 bit hash always use 4 level table. Everybody else use 4 level
+ * only for 4K page size.
+ */
+#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
+typedef struct { unsigned long pud; } pud_t;
+#define __pud(x)	((pud_t) { (x) })
+static inline unsigned long pud_val(pud_t x)
+{
+	return x.pud;
+}
+#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC64 */
+
+/* PGD level */
+typedef struct { unsigned long pgd; } pgd_t;
+#define __pgd(x)	((pgd_t) { (x) })
+static inline unsigned long pgd_val(pgd_t x)
+{
+	return x.pgd;
+}
+
+/* Page protection bits */
+typedef struct { unsigned long pgprot; } pgprot_t;
+#define pgprot_val(x)	((x).pgprot)
+#define __pgprot(x)	((pgprot_t) { (x) })
+
+#else
+
+/*
+ * .. while these make it easier on the compiler
+ */
+
+typedef pte_basic_t pte_t;
+#define __pte(x)	(x)
+static inline pte_basic_t pte_val(pte_t pte)
+{
+	return pte;
+}
+
+#ifdef CONFIG_PPC64
+typedef unsigned long pmd_t;
+#define __pmd(x)	(x)
+static inline unsigned long pmd_val(pmd_t pmd)
+{
+	return pmd;
+}
+
+#if defined(CONFIG_PPC_BOOK3S_64) || !defined(CONFIG_PPC_64K_PAGES)
+typedef unsigned long pud_t;
+#define __pud(x)	(x)
+static inline unsigned long pud_val(pud_t pud)
+{
+	return pud;
+}
+#endif /* CONFIG_PPC_BOOK3S_64 || !CONFIG_PPC_64K_PAGES */
+#endif /* CONFIG_PPC64 */
+
+typedef unsigned long pgd_t;
+#define __pgd(x)	(x)
+static inline unsigned long pgd_val(pgd_t pgd)
+{
+	return pgd;
+}
+
+typedef unsigned long pgprot_t;
+#define pgprot_val(x)	(x)
+#define __pgprot(x)	(x)
+
+#endif /* CONFIG_STRICT_MM_TYPECHECKS */
+/*
+ * With hash config 64k pages additionally define a bigger "real PTE" type that
+ * gathers the "second half" part of the PTE for pseudo 64k pages
+ */
+#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_PPC_STD_MMU_64)
+typedef struct { pte_t pte; unsigned long hidx; } real_pte_t;
+#else
+typedef struct { pte_t pte; } real_pte_t;
+#endif
+#endif /* _ASM_POWERPC_PGTABLE_TYPES_H */
diff --git a/arch/powerpc/include/asm/pmac_feature.h b/arch/powerpc/include/asm/pmac_feature.h
index 10902c9375d0..925697968946 100644
--- a/arch/powerpc/include/asm/pmac_feature.h
+++ b/arch/powerpc/include/asm/pmac_feature.h
@@ -46,7 +46,7 @@
 
 /* PowerSurge are the first generation of PCI Pmacs. This include
  * all of the Grand-Central based machines. We currently don't
- * differenciate most of them.
+ * differentiate most of them.
  */
 #define PMAC_TYPE_PSURGE		0x10	/* PowerSurge */
 #define PMAC_TYPE_ANS			0x11	/* Apple Network Server */
diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h
index ac2330820b9a..8ab8a1a9610a 100644
--- a/arch/powerpc/include/asm/processor.h
+++ b/arch/powerpc/include/asm/processor.h
@@ -236,7 +236,9 @@ struct thread_struct {
 #endif
 	struct arch_hw_breakpoint hw_brk; /* info on the hardware breakpoint */
 	unsigned long	trap_nr;	/* last trap # on this thread */
+	u8 load_fp;
 #ifdef CONFIG_ALTIVEC
+	u8 load_vec;
 	struct thread_vr_state vr_state;
 	struct thread_vr_state *vr_save_area;
 	unsigned long	vrsave;
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index c4cb2ffc624e..52ed654d01ba 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -75,6 +75,14 @@
 #define MSR_HV		0
 #endif
 
+/*
+ * To be used in shared book E/book S, this avoids needing to worry about
+ * book S/book E in shared code
+ */
+#ifndef MSR_SPE
+#define MSR_SPE 	0
+#endif
+
 #define MSR_VEC		__MASK(MSR_VEC_LG)	/* Enable AltiVec */
 #define MSR_VSX		__MASK(MSR_VSX_LG)	/* Enable VSX */
 #define MSR_POW		__MASK(MSR_POW_LG)	/* Enable Power Management */
@@ -376,7 +384,7 @@
 #define SPRN_TSCR	0x399	/* Thread Switch Control Register */
 
 #define SPRN_DEC	0x016		/* Decrement Register */
-#define SPRN_DER	0x095		/* Debug Enable Regsiter */
+#define SPRN_DER	0x095		/* Debug Enable Register */
 #define DER_RSTE	0x40000000	/* Reset Interrupt */
 #define DER_CHSTPE	0x20000000	/* Check Stop */
 #define DER_MCIE	0x10000000	/* Machine Check Interrupt */
@@ -401,7 +409,7 @@
 #define SPRN_DPDES	0x0B0		/* Directed Priv. Doorbell Exc. State */
 #define SPRN_EAR	0x11A		/* External Address Register */
 #define SPRN_HASH1	0x3D2		/* Primary Hash Address Register */
-#define SPRN_HASH2	0x3D3		/* Secondary Hash Address Resgister */
+#define SPRN_HASH2	0x3D3		/* Secondary Hash Address Register */
 #define SPRN_HID0	0x3F0		/* Hardware Implementation Register 0 */
 #define HID0_HDICE_SH	(63 - 23)	/* 970 HDEC interrupt enable */
 #define HID0_EMCP	(1<<31)		/* Enable Machine Check pin */
@@ -514,7 +522,7 @@
 #define ICTRL_EICP	0x00000100	/* enable icache par. check */
 #define SPRN_IMISS	0x3D4		/* Instruction TLB Miss Register */
 #define SPRN_IMMR	0x27E		/* Internal Memory Map Register */
-#define SPRN_L2CR	0x3F9		/* Level 2 Cache Control Regsiter */
+#define SPRN_L2CR	0x3F9		/* Level 2 Cache Control Register */
 #define SPRN_L2CR2	0x3f8
 #define L2CR_L2E		0x80000000	/* L2 enable */
 #define L2CR_L2PE		0x40000000	/* L2 parity enable */
@@ -549,7 +557,7 @@
 #define L2CR_L2DO_745x		0x00010000	/* L2 data only (745x) */
 #define L2CR_L2REP_745x		0x00001000	/* L2 repl. algorithm (745x) */
 #define L2CR_L2HWF_745x		0x00000800	/* L2 hardware flush (745x) */
-#define SPRN_L3CR		0x3FA	/* Level 3 Cache Control Regsiter */
+#define SPRN_L3CR		0x3FA	/* Level 3 Cache Control Register */
 #define L3CR_L3E		0x80000000	/* L3 enable */
 #define L3CR_L3PE		0x40000000	/* L3 data parity enable */
 #define L3CR_L3APE		0x20000000	/* L3 addr parity enable */
diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h
index 2fef74b474f0..737e012ef56e 100644
--- a/arch/powerpc/include/asm/reg_booke.h
+++ b/arch/powerpc/include/asm/reg_booke.h
@@ -681,7 +681,7 @@
 #define SPRN_CDBCR	0x3D7	/* Cache Debug Control Register */
 #define SPRN_TBHI	0x3DC	/* Time Base High */
 #define SPRN_TBLO	0x3DD	/* Time Base Low */
-#define SPRN_DBCR	0x3F2	/* Debug Control Regsiter */
+#define SPRN_DBCR	0x3F2	/* Debug Control Register */
 #define SPRN_PBL1	0x3FC	/* Protection Bound Lower 1 */
 #define SPRN_PBL2	0x3FE	/* Protection Bound Lower 2 */
 #define SPRN_PBU1	0x3FD	/* Protection Bound Upper 1 */
diff --git a/arch/powerpc/include/asm/smu.h b/arch/powerpc/include/asm/smu.h
index 37d2da6feabf..f280dd11243f 100644
--- a/arch/powerpc/include/asm/smu.h
+++ b/arch/powerpc/include/asm/smu.h
@@ -154,7 +154,7 @@
   *
   * The Darwin I2C driver is less subtle though. On any non-success status
   * from the response command, it waits 5ms and tries again up to 20 times,
-  * it doesn't differenciate between fatal errors or "busy" status.
+  * it doesn't differentiate between fatal errors or "busy" status.
   *
   * This driver provides an asynchronous paramblock based i2c command
   * interface to be used either directly by low level code or by a higher
diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h
index 5b268b6be74c..17c8380673a6 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -28,12 +28,14 @@ extern void giveup_all(struct task_struct *);
 extern void enable_kernel_fp(void);
 extern void flush_fp_to_thread(struct task_struct *);
 extern void giveup_fpu(struct task_struct *);
-extern void __giveup_fpu(struct task_struct *);
+extern void save_fpu(struct task_struct *);
 static inline void disable_kernel_fp(void)
 {
 	msr_check_and_clear(MSR_FP);
 }
 #else
+static inline void __giveup_fpu(struct task_struct *t) { }
+static inline void save_fpu(struct task_struct *t) { }
 static inline void flush_fp_to_thread(struct task_struct *t) { }
 #endif
 
@@ -41,18 +43,19 @@ static inline void flush_fp_to_thread(struct task_struct *t) { }
 extern void enable_kernel_altivec(void);
 extern void flush_altivec_to_thread(struct task_struct *);
 extern void giveup_altivec(struct task_struct *);
-extern void __giveup_altivec(struct task_struct *);
+extern void save_altivec(struct task_struct *);
 static inline void disable_kernel_altivec(void)
 {
 	msr_check_and_clear(MSR_VEC);
 }
+#else
+static inline void save_altivec(struct task_struct *t) { }
+static inline void __giveup_altivec(struct task_struct *t) { }
 #endif
 
 #ifdef CONFIG_VSX
 extern void enable_kernel_vsx(void);
 extern void flush_vsx_to_thread(struct task_struct *);
-extern void giveup_vsx(struct task_struct *);
-extern void __giveup_vsx(struct task_struct *);
 static inline void disable_kernel_vsx(void)
 {
 	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
@@ -68,6 +71,8 @@ static inline void disable_kernel_spe(void)
 {
 	msr_check_and_clear(MSR_SPE);
 }
+#else
+static inline void __giveup_spe(struct task_struct *t) { }
 #endif
 
 static inline void clear_task_ebb(struct task_struct *t)
diff --git a/arch/powerpc/include/asm/tlbflush.h b/arch/powerpc/include/asm/tlbflush.h
index 23d351ca0303..9f77f85e3e99 100644
--- a/arch/powerpc/include/asm/tlbflush.h
+++ b/arch/powerpc/include/asm/tlbflush.h
@@ -78,97 +78,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm)
 }
 
 #elif defined(CONFIG_PPC_STD_MMU_64)
-
-#define MMU_NO_CONTEXT		0
-
-/*
- * TLB flushing for 64-bit hash-MMU CPUs
- */
-
-#include <linux/percpu.h>
-#include <asm/page.h>
-
-#define PPC64_TLB_BATCH_NR 192
-
-struct ppc64_tlb_batch {
-	int			active;
-	unsigned long		index;
-	struct mm_struct	*mm;
-	real_pte_t		pte[PPC64_TLB_BATCH_NR];
-	unsigned long		vpn[PPC64_TLB_BATCH_NR];
-	unsigned int		psize;
-	int			ssize;
-};
-DECLARE_PER_CPU(struct ppc64_tlb_batch, ppc64_tlb_batch);
-
-extern void __flush_tlb_pending(struct ppc64_tlb_batch *batch);
-
-#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
-
-static inline void arch_enter_lazy_mmu_mode(void)
-{
-	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
-
-	batch->active = 1;
-}
-
-static inline void arch_leave_lazy_mmu_mode(void)
-{
-	struct ppc64_tlb_batch *batch = this_cpu_ptr(&ppc64_tlb_batch);
-
-	if (batch->index)
-		__flush_tlb_pending(batch);
-	batch->active = 0;
-}
-
-#define arch_flush_lazy_mmu_mode()      do {} while (0)
-
-
-extern void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize,
-			    int ssize, unsigned long flags);
-extern void flush_hash_range(unsigned long number, int local);
-extern void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
-				pmd_t *pmdp, unsigned int psize, int ssize,
-				unsigned long flags);
-
-static inline void local_flush_tlb_mm(struct mm_struct *mm)
-{
-}
-
-static inline void flush_tlb_mm(struct mm_struct *mm)
-{
-}
-
-static inline void local_flush_tlb_page(struct vm_area_struct *vma,
-					unsigned long vmaddr)
-{
-}
-
-static inline void flush_tlb_page(struct vm_area_struct *vma,
-				  unsigned long vmaddr)
-{
-}
-
-static inline void flush_tlb_page_nohash(struct vm_area_struct *vma,
-					 unsigned long vmaddr)
-{
-}
-
-static inline void flush_tlb_range(struct vm_area_struct *vma,
-				   unsigned long start, unsigned long end)
-{
-}
-
-static inline void flush_tlb_kernel_range(unsigned long start,
-					  unsigned long end)
-{
-}
-
-/* Private function for use by PCI IO mapping code */
-extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start,
-				     unsigned long end);
-extern void flush_tlb_pmd_range(struct mm_struct *mm, pmd_t *pmd,
-				unsigned long addr);
+#include <asm/book3s/64/tlbflush-hash.h>
 #else
 #error Unsupported MMU type
 #endif
diff --git a/arch/powerpc/include/asm/trace.h b/arch/powerpc/include/asm/trace.h
index 8e86b48d0369..32e36b16773f 100644
--- a/arch/powerpc/include/asm/trace.h
+++ b/arch/powerpc/include/asm/trace.h
@@ -57,12 +57,14 @@ DEFINE_EVENT(ppc64_interrupt_class, timer_interrupt_exit,
 extern void hcall_tracepoint_regfunc(void);
 extern void hcall_tracepoint_unregfunc(void);
 
-TRACE_EVENT_FN(hcall_entry,
+TRACE_EVENT_FN_COND(hcall_entry,
 
 	TP_PROTO(unsigned long opcode, unsigned long *args),
 
 	TP_ARGS(opcode, args),
 
+	TP_CONDITION(cpu_online(raw_smp_processor_id())),
+
 	TP_STRUCT__entry(
 		__field(unsigned long, opcode)
 	),
@@ -76,13 +78,15 @@ TRACE_EVENT_FN(hcall_entry,
 	hcall_tracepoint_regfunc, hcall_tracepoint_unregfunc
 );
 
-TRACE_EVENT_FN(hcall_exit,
+TRACE_EVENT_FN_COND(hcall_exit,
 
 	TP_PROTO(unsigned long opcode, unsigned long retval,
 		unsigned long *retbuf),
 
 	TP_ARGS(opcode, retval, retbuf),
 
+	TP_CONDITION(cpu_online(raw_smp_processor_id())),
+
 	TP_STRUCT__entry(
 		__field(unsigned long, opcode)
 		__field(unsigned long, retval)
diff --git a/arch/powerpc/include/asm/uninorth.h b/arch/powerpc/include/asm/uninorth.h
index d12b11d7641e..a1d112979fd2 100644
--- a/arch/powerpc/include/asm/uninorth.h
+++ b/arch/powerpc/include/asm/uninorth.h
@@ -132,7 +132,7 @@
 
 /* This one _might_ return the CPU number of the CPU reading it;
  * the bootROM decides whether to boot or to sleep/spinloop depending
- * on this register beeing 0 or not
+ * on this register being 0 or not
  */
 #define UNI_N_CPU_NUMBER		0x0050
 
diff --git a/arch/powerpc/include/asm/xics.h b/arch/powerpc/include/asm/xics.h
index 0e25bdb190bb..5d61bbced6a1 100644
--- a/arch/powerpc/include/asm/xics.h
+++ b/arch/powerpc/include/asm/xics.h
@@ -1,5 +1,5 @@
 /*
- * Common definitions accross all variants of ICP and ICS interrupt
+ * Common definitions across all variants of ICP and ICS interrupt
  * controllers.
  */
 
diff --git a/arch/powerpc/include/uapi/asm/epapr_hcalls.h b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
index 7f9c74b46704..b4504f394427 100644
--- a/arch/powerpc/include/uapi/asm/epapr_hcalls.h
+++ b/arch/powerpc/include/uapi/asm/epapr_hcalls.h
@@ -78,7 +78,7 @@
 #define EV_SUCCESS		0
 #define EV_EPERM		1	/* Operation not permitted */
 #define EV_ENOENT		2	/*  Entry Not Found */
-#define EV_EIO			3	/* I/O error occured */
+#define EV_EIO			3	/* I/O error occurred */
 #define EV_EAGAIN		4	/* The operation had insufficient
 					 * resources to complete and should be
 					 * retried
@@ -89,7 +89,7 @@
 #define EV_ENODEV		7	/* No such device */
 #define EV_EINVAL		8	/* An argument supplied to the hcall
 					   was out of range or invalid */
-#define EV_INTERNAL		9	/* An internal error occured */
+#define EV_INTERNAL		9	/* An internal error occurred */
 #define EV_CONFIG		10	/* A configuration error was detected */
 #define EV_INVALID_STATE	11	/* The object is in an invalid state */
 #define EV_UNIMPLEMENTED	12	/* Unimplemented hypercall */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 07cebc3514f3..10d5eab19458 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -95,12 +95,14 @@ int main(void)
 	DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state));
 	DEFINE(THREAD_FPSAVEAREA, offsetof(struct thread_struct, fp_save_area));
 	DEFINE(FPSTATE_FPSCR, offsetof(struct thread_fp_state, fpscr));
+	DEFINE(THREAD_LOAD_FP, offsetof(struct thread_struct, load_fp));
 #ifdef CONFIG_ALTIVEC
 	DEFINE(THREAD_VRSTATE, offsetof(struct thread_struct, vr_state));
 	DEFINE(THREAD_VRSAVEAREA, offsetof(struct thread_struct, vr_save_area));
 	DEFINE(THREAD_VRSAVE, offsetof(struct thread_struct, vrsave));
 	DEFINE(THREAD_USED_VR, offsetof(struct thread_struct, used_vr));
 	DEFINE(VRSTATE_VSCR, offsetof(struct thread_vr_state, vscr));
+	DEFINE(THREAD_LOAD_VEC, offsetof(struct thread_struct, load_vec));
 #endif /* CONFIG_ALTIVEC */
 #ifdef CONFIG_VSX
 	DEFINE(THREAD_USED_VSR, offsetof(struct thread_struct, used_vsr));
diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S
index 9c9b7411b28b..584e119fa8b0 100644
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -15,6 +15,7 @@
 #include <asm/ppc_asm.h>
 #include <asm/asm-offsets.h>
 #include <asm/cache.h>
+#include <asm/book3s/64/mmu-hash.h>
 
 /* Entry: r3 = crap, r4 = ptr to cputable entry
  *
@@ -83,6 +84,39 @@ _GLOBAL(__restore_cpu_power8)
 	mtlr	r11
 	blr
 
+_GLOBAL(__setup_cpu_power9)
+	mflr	r11
+	bl	__init_FSCR
+	bl	__init_hvmode_206
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mfspr	r3,SPRN_LPCR
+	ori	r3, r3, LPCR_PECEDH
+	bl	__init_LPCR
+	bl	__init_HFSCR
+	bl	__init_tlb_power9
+	mtlr	r11
+	blr
+
+_GLOBAL(__restore_cpu_power9)
+	mflr	r11
+	bl	__init_FSCR
+	mfmsr	r3
+	rldicl.	r0,r3,4,63
+	mtlr	r11
+	beqlr
+	li	r0,0
+	mtspr	SPRN_LPID,r0
+	mfspr   r3,SPRN_LPCR
+	ori	r3, r3, LPCR_PECEDH
+	bl	__init_LPCR
+	bl	__init_HFSCR
+	bl	__init_tlb_power9
+	mtlr	r11
+	blr
+
 __init_hvmode_206:
 	/* Disable CPU_FTR_HVMODE and exit if MSR:HV is not set */
 	mfmsr	r3
@@ -139,7 +173,7 @@ __init_HFSCR:
  * (invalidate by congruence class). P7 has 128 CCs., P8 has 512.
  */
 __init_tlb_power7:
-	li	r6,128
+	li	r6,POWER7_TLB_SETS
 	mtctr	r6
 	li	r7,0xc00	/* IS field = 0b11 */
 	ptesync
@@ -150,7 +184,18 @@ __init_tlb_power7:
 1:	blr
 
 __init_tlb_power8:
-	li	r6,512
+	li	r6,POWER8_TLB_SETS
+	mtctr	r6
+	li	r7,0xc00	/* IS field = 0b11 */
+	ptesync
+2:	tlbiel	r7
+	addi	r7,r7,0x1000
+	bdnz	2b
+	ptesync
+1:	blr
+
+__init_tlb_power9:
+	li	r6,POWER9_TLB_SETS_HASH
 	mtctr	r6
 	li	r7,0xc00	/* IS field = 0b11 */
 	ptesync
diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c
index 7d80bfdfb15e..be4d73053bed 100644
--- a/arch/powerpc/kernel/cputable.c
+++ b/arch/powerpc/kernel/cputable.c
@@ -70,9 +70,12 @@ extern void __setup_cpu_power7(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power7(void);
 extern void __setup_cpu_power8(unsigned long offset, struct cpu_spec* spec);
 extern void __restore_cpu_power8(void);
+extern void __setup_cpu_power9(unsigned long offset, struct cpu_spec* spec);
+extern void __restore_cpu_power9(void);
 extern void __restore_cpu_a2(void);
 extern void __flush_tlb_power7(unsigned int action);
 extern void __flush_tlb_power8(unsigned int action);
+extern void __flush_tlb_power9(unsigned int action);
 extern long __machine_check_early_realmode_p7(struct pt_regs *regs);
 extern long __machine_check_early_realmode_p8(struct pt_regs *regs);
 #endif /* CONFIG_PPC64 */
@@ -116,6 +119,11 @@ extern void __restore_cpu_e6500(void);
 #define COMMON_USER_PA6T	(COMMON_USER_PPC64 | PPC_FEATURE_PA6T |\
 				 PPC_FEATURE_TRUE_LE | \
 				 PPC_FEATURE_HAS_ALTIVEC_COMP)
+#define COMMON_USER_POWER9	COMMON_USER_POWER8
+#define COMMON_USER2_POWER9	(COMMON_USER2_POWER8 | \
+				 PPC_FEATURE2_ARCH_3_00 | \
+				 PPC_FEATURE2_HAS_IEEE128)
+
 #ifdef CONFIG_PPC_BOOK3E_64
 #define COMMON_USER_BOOKE	(COMMON_USER_PPC64 | PPC_FEATURE_BOOKE)
 #else
@@ -499,6 +507,25 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
+	{	/* Power9 */
+		.pvr_mask		= 0xffff0000,
+		.pvr_value		= 0x004e0000,
+		.cpu_name		= "POWER9 (raw)",
+		.cpu_features		= CPU_FTRS_POWER9,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power9",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.flush_tlb		= __flush_tlb_power9,
+		.platform		= "power9",
+	},
 	{	/* Cell Broadband Engine */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x00700000,
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 40e4d4a27663..6544017eb90b 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -268,13 +268,6 @@ static void *eeh_dump_pe_log(void *data, void *flag)
 	struct eeh_dev *edev, *tmp;
 	size_t *plen = flag;
 
-	/* If the PE's config space is blocked, 0xFF's will be
-	 * returned. It's pointless to collect the log in this
-	 * case.
-	 */
-	if (pe->state & EEH_PE_CFG_BLOCKED)
-		return NULL;
-
 	eeh_pe_for_each_dev(pe, edev, tmp)
 		*plen += eeh_dump_dev_log(edev, pci_regs_buf + *plen,
 					  EEH_PCI_REGS_LOG_LEN - *plen);
@@ -677,7 +670,7 @@ int eeh_pci_enable(struct eeh_pe *pe, int function)
 	/* Check if the request is finished successfully */
 	if (active_flag) {
 		rc = eeh_ops->wait_state(pe, PCI_BUS_RESET_WAIT_MSEC);
-		if (rc <= 0)
+		if (rc < 0)
 			return rc;
 
 		if (rc & active_flag)
@@ -739,7 +732,7 @@ static void *eeh_restore_dev_state(void *data, void *userdata)
 }
 
 /**
- * pcibios_set_pcie_slot_reset - Set PCI-E reset state
+ * pcibios_set_pcie_reset_state - Set PCI-E reset state
  * @dev: pci device struct
  * @state: reset state to enter
  *
@@ -761,7 +754,8 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 	case pcie_deassert_reset:
 		eeh_ops->reset(pe, EEH_RESET_DEACTIVATE);
 		eeh_unfreeze_pe(pe, false);
-		eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
+		if (!(pe->type & EEH_PE_VF))
+			eeh_pe_state_clear(pe, EEH_PE_CFG_BLOCKED);
 		eeh_pe_dev_traverse(pe, eeh_restore_dev_state, dev);
 		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);
 		break;
@@ -769,14 +763,16 @@ int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state stat
 		eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);
 		eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
 		eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
-		eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+		if (!(pe->type & EEH_PE_VF))
+			eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
 		eeh_ops->reset(pe, EEH_RESET_HOT);
 		break;
 	case pcie_warm_reset:
 		eeh_pe_state_mark_with_cfg(pe, EEH_PE_ISOLATED);
 		eeh_ops->set_option(pe, EEH_OPT_FREEZE_PE);
 		eeh_pe_dev_traverse(pe, eeh_disable_and_save_dev_state, dev);
-		eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
+		if (!(pe->type & EEH_PE_VF))
+			eeh_pe_state_mark(pe, EEH_PE_CFG_BLOCKED);
 		eeh_ops->reset(pe, EEH_RESET_FUNDAMENTAL);
 		break;
 	default:
@@ -1243,6 +1239,14 @@ void eeh_remove_device(struct pci_dev *dev)
 	 * from the parent PE during the BAR resotre.
 	 */
 	edev->pdev = NULL;
+
+	/*
+	 * The flag "in_error" is used to trace EEH devices for VFs
+	 * in error state or not. It's set in eeh_report_error(). If
+	 * it's not set, eeh_report_{reset,resume}() won't be called
+	 * for the VF EEH device.
+	 */
+	edev->in_error = false;
 	dev->dev.archdata.edev = NULL;
 	if (!(edev->pe->state & EEH_PE_KEEP))
 		eeh_rmv_from_parent_pe(edev);
@@ -1537,6 +1541,17 @@ int eeh_pe_get_state(struct eeh_pe *pe)
 	if (!eeh_ops || !eeh_ops->get_state)
 		return -ENOENT;
 
+	/*
+	 * If the parent PE is owned by the host kernel and is undergoing
+	 * error recovery, we should return the PE state as temporarily
+	 * unavailable so that the error recovery on the guest is suspended
+	 * until the recovery completes on the host.
+	 */
+	if (pe->parent &&
+	    !(pe->state & EEH_PE_REMOVED) &&
+	    (pe->parent->state & (EEH_PE_ISOLATED | EEH_PE_RECOVERING)))
+		return EEH_PE_STATE_UNAVAIL;
+
 	result = eeh_ops->get_state(pe, NULL);
 	rst_active = !!(result & EEH_STATE_RESET_ACTIVE);
 	dma_en = !!(result & EEH_STATE_DMA_ENABLED);
diff --git a/arch/powerpc/kernel/eeh_cache.c b/arch/powerpc/kernel/eeh_cache.c
index a1e86e172e3c..ddbcfab7efdf 100644
--- a/arch/powerpc/kernel/eeh_cache.c
+++ b/arch/powerpc/kernel/eeh_cache.c
@@ -195,8 +195,11 @@ static void __eeh_addr_cache_insert_dev(struct pci_dev *dev)
 		return;
 	}
 
-	/* Walk resources on this device, poke them into the tree */
-	for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
+	/*
+	 * Walk resources on this device, poke the first 7 (6 normal BAR and 1
+	 * ROM BAR) into the tree.
+	 */
+	for (i = 0; i <= PCI_ROM_RESOURCE; i++) {
 		resource_size_t start = pci_resource_start(dev,i);
 		resource_size_t end = pci_resource_end(dev,i);
 		unsigned long flags = pci_resource_flags(dev,i);
@@ -222,10 +225,6 @@ void eeh_addr_cache_insert_dev(struct pci_dev *dev)
 {
 	unsigned long flags;
 
-	/* Ignore PCI bridges */
-	if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE)
-		return;
-
 	spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags);
 	__eeh_addr_cache_insert_dev(dev);
 	spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags);
diff --git a/arch/powerpc/kernel/eeh_dev.c b/arch/powerpc/kernel/eeh_dev.c
index aabba94ff9cb..7815095fe3d8 100644
--- a/arch/powerpc/kernel/eeh_dev.c
+++ b/arch/powerpc/kernel/eeh_dev.c
@@ -67,6 +67,7 @@ void *eeh_dev_init(struct pci_dn *pdn, void *data)
 	edev->pdn = pdn;
 	edev->phb = phb;
 	INIT_LIST_HEAD(&edev->list);
+	INIT_LIST_HEAD(&edev->rmv_list);
 
 	return NULL;
 }
diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c
index 938742135ee0..fb6207d2c604 100644
--- a/arch/powerpc/kernel/eeh_driver.c
+++ b/arch/powerpc/kernel/eeh_driver.c
@@ -34,6 +34,11 @@
 #include <asm/prom.h>
 #include <asm/rtas.h>
 
+struct eeh_rmv_data {
+	struct list_head edev_list;
+	int removed;
+};
+
 /**
  * eeh_pcid_name - Retrieve name of PCI device driver
  * @pdev: PCI device
@@ -190,7 +195,7 @@ static void *eeh_report_error(void *data, void *userdata)
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver;
 
-	if (!dev || eeh_dev_removed(edev))
+	if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
 		return NULL;
 	dev->error_state = pci_channel_io_frozen;
 
@@ -211,6 +216,7 @@ static void *eeh_report_error(void *data, void *userdata)
 	if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc;
 	if (*res == PCI_ERS_RESULT_NONE) *res = rc;
 
+	edev->in_error = true;
 	eeh_pcid_put(dev);
 	return NULL;
 }
@@ -231,7 +237,7 @@ static void *eeh_report_mmio_enabled(void *data, void *userdata)
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver;
 
-	if (!dev || eeh_dev_removed(edev))
+	if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
 		return NULL;
 
 	driver = eeh_pcid_get(dev);
@@ -271,7 +277,7 @@ static void *eeh_report_reset(void *data, void *userdata)
 	enum pci_ers_result rc, *res = userdata;
 	struct pci_driver *driver;
 
-	if (!dev || eeh_dev_removed(edev))
+	if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
 		return NULL;
 	dev->error_state = pci_channel_io_normal;
 
@@ -282,7 +288,8 @@ static void *eeh_report_reset(void *data, void *userdata)
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->slot_reset ||
-	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+	    (edev->mode & EEH_DEV_NO_HANDLER) ||
+	    (!edev->in_error)) {
 		eeh_pcid_put(dev);
 		return NULL;
 	}
@@ -326,20 +333,23 @@ static void *eeh_report_resume(void *data, void *userdata)
 {
 	struct eeh_dev *edev = (struct eeh_dev *)data;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	bool was_in_error;
 	struct pci_driver *driver;
 
-	if (!dev || eeh_dev_removed(edev))
+	if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
 		return NULL;
 	dev->error_state = pci_channel_io_normal;
 
 	driver = eeh_pcid_get(dev);
 	if (!driver) return NULL;
 
+	was_in_error = edev->in_error;
+	edev->in_error = false;
 	eeh_enable_irq(dev);
 
 	if (!driver->err_handler ||
 	    !driver->err_handler->resume ||
-	    (edev->mode & EEH_DEV_NO_HANDLER)) {
+	    (edev->mode & EEH_DEV_NO_HANDLER) || !was_in_error) {
 		edev->mode &= ~EEH_DEV_NO_HANDLER;
 		eeh_pcid_put(dev);
 		return NULL;
@@ -365,7 +375,7 @@ static void *eeh_report_failure(void *data, void *userdata)
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
 	struct pci_driver *driver;
 
-	if (!dev || eeh_dev_removed(edev))
+	if (!dev || eeh_dev_removed(edev) || eeh_pe_passed(edev->pe))
 		return NULL;
 	dev->error_state = pci_channel_io_perm_failure;
 
@@ -386,12 +396,40 @@ static void *eeh_report_failure(void *data, void *userdata)
 	return NULL;
 }
 
+static void *eeh_add_virt_device(void *data, void *userdata)
+{
+	struct pci_driver *driver;
+	struct eeh_dev *edev = (struct eeh_dev *)data;
+	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
+	struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+	if (!(edev->physfn)) {
+		pr_warn("%s: EEH dev %04x:%02x:%02x.%01x not for VF\n",
+			__func__, edev->phb->global_number, pdn->busno,
+			PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+		return NULL;
+	}
+
+	driver = eeh_pcid_get(dev);
+	if (driver) {
+		eeh_pcid_put(dev);
+		if (driver->err_handler)
+			return NULL;
+	}
+
+#ifdef CONFIG_PPC_POWERNV
+	pci_iov_add_virtfn(edev->physfn, pdn->vf_index, 0);
+#endif
+	return NULL;
+}
+
 static void *eeh_rmv_device(void *data, void *userdata)
 {
 	struct pci_driver *driver;
 	struct eeh_dev *edev = (struct eeh_dev *)data;
 	struct pci_dev *dev = eeh_dev_to_pci_dev(edev);
-	int *removed = (int *)userdata;
+	struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata;
+	int *removed = rmv_data ? &rmv_data->removed : NULL;
 
 	/*
 	 * Actually, we should remove the PCI bridges as well.
@@ -416,10 +454,13 @@ static void *eeh_rmv_device(void *data, void *userdata)
 	driver = eeh_pcid_get(dev);
 	if (driver) {
 		eeh_pcid_put(dev);
-		if (driver->err_handler &&
+		if (removed &&
+		    eeh_pe_passed(edev->pe))
+			return NULL;
+		if (removed &&
+		    driver->err_handler &&
 		    driver->err_handler->error_detected &&
-		    driver->err_handler->slot_reset &&
-		    driver->err_handler->resume)
+		    driver->err_handler->slot_reset)
 			return NULL;
 	}
 
@@ -428,11 +469,29 @@ static void *eeh_rmv_device(void *data, void *userdata)
 		 pci_name(dev));
 	edev->bus = dev->bus;
 	edev->mode |= EEH_DEV_DISCONNECTED;
-	(*removed)++;
+	if (removed)
+		(*removed)++;
 
-	pci_lock_rescan_remove();
-	pci_stop_and_remove_bus_device(dev);
-	pci_unlock_rescan_remove();
+	if (edev->physfn) {
+#ifdef CONFIG_PPC_POWERNV
+		struct pci_dn *pdn = eeh_dev_to_pdn(edev);
+
+		pci_iov_remove_virtfn(edev->physfn, pdn->vf_index, 0);
+		edev->pdev = NULL;
+
+		/*
+		 * We have to set the VF PE number to invalid one, which is
+		 * required to plug the VF successfully.
+		 */
+		pdn->pe_number = IODA_INVALID_PE;
+#endif
+		if (rmv_data)
+			list_add(&edev->rmv_list, &rmv_data->edev_list);
+	} else {
+		pci_lock_rescan_remove();
+		pci_stop_and_remove_bus_device(dev);
+		pci_unlock_rescan_remove();
+	}
 
 	return NULL;
 }
@@ -546,11 +605,13 @@ int eeh_pe_reset_and_recover(struct eeh_pe *pe)
  * During the reset, udev might be invoked because those affected
  * PCI devices will be removed and then added.
  */
-static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
+static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
+				struct eeh_rmv_data *rmv_data)
 {
 	struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
 	struct timeval tstamp;
-	int cnt, rc, removed = 0;
+	int cnt, rc;
+	struct eeh_dev *edev;
 
 	/* pcibios will clear the counter; save the value */
 	cnt = pe->freeze_count;
@@ -564,11 +625,16 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 	 */
 	eeh_pe_state_mark(pe, EEH_PE_KEEP);
 	if (bus) {
-		pci_lock_rescan_remove();
-		pcibios_remove_pci_devices(bus);
-		pci_unlock_rescan_remove();
+		if (pe->type & EEH_PE_VF) {
+			eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+		} else {
+			eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+			pci_lock_rescan_remove();
+			pcibios_remove_pci_devices(bus);
+			pci_unlock_rescan_remove();
+		}
 	} else if (frozen_bus) {
-		eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
+		eeh_pe_dev_traverse(pe, eeh_rmv_device, &rmv_data);
 	}
 
 	/*
@@ -610,14 +676,22 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 		 * PE. We should disconnect it so the binding can be
 		 * rebuilt when adding PCI devices.
 		 */
+		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
 		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
-		pcibios_add_pci_devices(bus);
-	} else if (frozen_bus && removed) {
+		if (pe->type & EEH_PE_VF)
+			eeh_add_virt_device(edev, NULL);
+		else
+			pcibios_add_pci_devices(bus);
+	} else if (frozen_bus && rmv_data->removed) {
 		pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
 		ssleep(5);
 
+		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
 		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
-		pcibios_add_pci_devices(frozen_bus);
+		if (pe->type & EEH_PE_VF)
+			eeh_add_virt_device(edev, NULL);
+		else
+			pcibios_add_pci_devices(frozen_bus);
 	}
 	eeh_pe_state_clear(pe, EEH_PE_KEEP);
 
@@ -636,8 +710,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
 static void eeh_handle_normal_event(struct eeh_pe *pe)
 {
 	struct pci_bus *frozen_bus;
+	struct eeh_dev *edev, *tmp;
 	int rc = 0;
 	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
+	struct eeh_rmv_data rmv_data = {LIST_HEAD_INIT(rmv_data.edev_list), 0};
 
 	frozen_bus = eeh_pe_bus_get(pe);
 	if (!frozen_bus) {
@@ -692,7 +768,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	 */
 	if (result == PCI_ERS_RESULT_NONE) {
 		pr_info("EEH: Reset with hotplug activity\n");
-		rc = eeh_reset_device(pe, frozen_bus);
+		rc = eeh_reset_device(pe, frozen_bus, NULL);
 		if (rc) {
 			pr_warn("%s: Unable to reset, err=%d\n",
 				__func__, rc);
@@ -744,7 +820,7 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 	/* If any device called out for a reset, then reset the slot */
 	if (result == PCI_ERS_RESULT_NEED_RESET) {
 		pr_info("EEH: Reset without hotplug activity\n");
-		rc = eeh_reset_device(pe, NULL);
+		rc = eeh_reset_device(pe, NULL, &rmv_data);
 		if (rc) {
 			pr_warn("%s: Cannot reset, err=%d\n",
 				__func__, rc);
@@ -764,6 +840,15 @@ static void eeh_handle_normal_event(struct eeh_pe *pe)
 		goto hard_fail;
 	}
 
+	/*
+	 * For those hot removed VFs, we should add back them after PF get
+	 * recovered properly.
+	 */
+	list_for_each_entry_safe(edev, tmp, &rmv_data.edev_list, rmv_list) {
+		eeh_add_virt_device(edev, NULL);
+		list_del(&edev->rmv_list);
+	}
+
 	/* Tell all device drivers that they can resume operations */
 	pr_info("EEH: Notify device driver to resume\n");
 	eeh_pe_dev_traverse(pe, eeh_report_resume, NULL);
@@ -803,11 +888,17 @@ perm_error:
 	 * the their PCI config any more.
 	 */
 	if (frozen_bus) {
-		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+		if (pe->type & EEH_PE_VF) {
+			eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
+			eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
+		} else {
+			eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
+			eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
 
-		pci_lock_rescan_remove();
-		pcibios_remove_pci_devices(frozen_bus);
-		pci_unlock_rescan_remove();
+			pci_lock_rescan_remove();
+			pcibios_remove_pci_devices(frozen_bus);
+			pci_unlock_rescan_remove();
+		}
 	}
 }
 
@@ -886,6 +977,7 @@ static void eeh_handle_special_event(void)
 					continue;
 
 				/* Notify all devices to be down */
+				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
 				bus = eeh_pe_bus_get(phb_pe);
 				eeh_pe_dev_traverse(pe,
 					eeh_report_failure, NULL);
diff --git a/arch/powerpc/kernel/eeh_pe.c b/arch/powerpc/kernel/eeh_pe.c
index ca9e5371930e..eea48d8baf49 100644
--- a/arch/powerpc/kernel/eeh_pe.c
+++ b/arch/powerpc/kernel/eeh_pe.c
@@ -299,7 +299,10 @@ static struct eeh_pe *eeh_pe_get_parent(struct eeh_dev *edev)
 	 * EEH device already having associated PE, but
 	 * the direct parent EEH device doesn't have yet.
 	 */
-	pdn = pdn ? pdn->parent : NULL;
+	if (edev->physfn)
+		pdn = pci_get_pdn(edev->physfn);
+	else
+		pdn = pdn ? pdn->parent : NULL;
 	while (pdn) {
 		/* We're poking out of PCI territory */
 		parent = pdn_to_eeh_dev(pdn);
@@ -382,7 +385,10 @@ int eeh_add_to_parent_pe(struct eeh_dev *edev)
 	}
 
 	/* Create a new EEH PE */
-	pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
+	if (edev->physfn)
+		pe = eeh_pe_alloc(edev->phb, EEH_PE_VF);
+	else
+		pe = eeh_pe_alloc(edev->phb, EEH_PE_DEVICE);
 	if (!pe) {
 		pr_err("%s: out of memory!\n", __func__);
 		return -ENOMEM;
@@ -920,25 +926,21 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe)
  */
 struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
 {
-	struct pci_bus *bus = NULL;
 	struct eeh_dev *edev;
 	struct pci_dev *pdev;
 
-	if (pe->type & EEH_PE_PHB) {
-		bus = pe->phb->bus;
-	} else if (pe->type & EEH_PE_BUS ||
-		   pe->type & EEH_PE_DEVICE) {
-		if (pe->bus) {
-			bus = pe->bus;
-			goto out;
-		}
+	if (pe->type & EEH_PE_PHB)
+		return pe->phb->bus;
 
-		edev = list_first_entry(&pe->edevs, struct eeh_dev, list);
-		pdev = eeh_dev_to_pci_dev(edev);
-		if (pdev)
-			bus = pdev->bus;
-	}
+	/* The primary bus might be cached during probe time */
+	if (pe->state & EEH_PE_PRI_BUS)
+		return pe->bus;
 
-out:
-	return bus;
+	/* Retrieve the parent PCI bus of first (top) PCI device */
+	edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list);
+	pdev = eeh_dev_to_pci_dev(edev);
+	if (pdev)
+		return pdev->bus;
+
+	return NULL;
 }
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index ec7f8aada697..b9b125327f27 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -210,7 +210,20 @@ system_call:			/* label this so stack traces look sane */
 	li	r11,-MAX_ERRNO
 	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
 	bne-	syscall_exit_work
-	cmpld	r3,r11
+
+	andi.	r0,r8,MSR_FP
+	beq 2f
+#ifdef CONFIG_ALTIVEC
+	andis.	r0,r8,MSR_VEC@h
+	bne	3f
+#endif
+2:	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_math
+	ld	r8,_MSR(r1)
+	ld	r3,RESULT(r1)
+	li	r11,-MAX_ERRNO
+
+3:	cmpld	r3,r11
 	ld	r5,_CCR(r1)
 	bge-	syscall_error
 .Lsyscall_error_cont:
@@ -602,8 +615,8 @@ _GLOBAL(ret_from_except_lite)
 
 	/* Check current_thread_info()->flags */
 	andi.	r0,r4,_TIF_USER_WORK_MASK
-#ifdef CONFIG_PPC_BOOK3E
 	bne	1f
+#ifdef CONFIG_PPC_BOOK3E
 	/*
 	 * Check to see if the dbcr0 register is set up to debug.
 	 * Use the internal debug mode bit to do this.
@@ -618,7 +631,9 @@ _GLOBAL(ret_from_except_lite)
 	mtspr	SPRN_DBSR,r10
 	b	restore
 #else
-	beq	restore
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	restore_math
+	b	restore
 #endif
 1:	andi.	r0,r4,_TIF_NEED_RESCHED
 	beq	2f
diff --git a/arch/powerpc/kernel/fpu.S b/arch/powerpc/kernel/fpu.S
index 2117eaca3d28..15da2b5df85e 100644
--- a/arch/powerpc/kernel/fpu.S
+++ b/arch/powerpc/kernel/fpu.S
@@ -130,6 +130,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	or	r12,r12,r4
 	std	r12,_MSR(r1)
 #endif
+	/* Don't care if r4 overflows, this is desired behaviour */
+	lbz	r4,THREAD_LOAD_FP(r5)
+	addi	r4,r4,1
+	stb	r4,THREAD_LOAD_FP(r5)
 	addi	r10,r5,THREAD_FPSTATE
 	lfd	fr0,FPSTATE_FPSCR(r10)
 	MTFSF_L(fr0)
@@ -139,33 +143,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	blr
 
 /*
- * __giveup_fpu(tsk)
- * Disable FP for the task given as the argument,
- * and save the floating-point registers in its thread_struct.
+ * save_fpu(tsk)
+ * Save the floating-point registers in its thread_struct.
  * Enables the FPU for use in the kernel on return.
  */
-_GLOBAL(__giveup_fpu)
+_GLOBAL(save_fpu)
 	addi	r3,r3,THREAD	        /* want THREAD of task */
 	PPC_LL	r6,THREAD_FPSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)
 	PPC_LCMPI	0,r6,0
 	bne	2f
 	addi	r6,r3,THREAD_FPSTATE
-2:	PPC_LCMPI	0,r5,0
-	SAVE_32FPVSRS(0, R4, R6)
+2:	SAVE_32FPVSRS(0, R4, R6)
 	mffs	fr0
 	stfd	fr0,FPSTATE_FPSCR(r6)
-	beq	1f
-	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	li	r3,MSR_FP|MSR_FE0|MSR_FE1
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
-	oris	r3,r3,MSR_VSX@h
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
-#endif
-	andc	r4,r4,r3		/* disable FP for previous task */
-	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
 	blr
 
 /*
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S
index b5061abbd2e0..9cdf5c71e426 100644
--- a/arch/powerpc/kernel/head_44x.S
+++ b/arch/powerpc/kernel/head_44x.S
@@ -806,7 +806,7 @@ _GLOBAL(set_context)
 _GLOBAL(init_cpu_state)
 	mflr	r22
 #ifdef CONFIG_PPC_47x
-	/* We use the PVR to differenciate 44x cores from 476 */
+	/* We use the PVR to differentiate 44x cores from 476 */
 	mfspr	r3,SPRN_PVR
 	srwi	r3,r3,16
 	cmplwi	cr0,r3,PVR_476FPE@h
diff --git a/arch/powerpc/kernel/idle_power7.S b/arch/powerpc/kernel/idle_power7.S
index cf4fb5429cf1..470ceebd2d23 100644
--- a/arch/powerpc/kernel/idle_power7.S
+++ b/arch/powerpc/kernel/idle_power7.S
@@ -19,7 +19,7 @@
 #include <asm/kvm_book3s_asm.h>
 #include <asm/opal.h>
 #include <asm/cpuidle.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 
 #undef DEBUG
 
diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c
index e77c3ccf8dcf..dbf098121ce6 100644
--- a/arch/powerpc/kernel/kgdb.c
+++ b/arch/powerpc/kernel/kgdb.c
@@ -445,7 +445,11 @@ int kgdb_arch_handle_exception(int vector, int signo, int err_code,
  * Global data
  */
 struct kgdb_arch arch_kgdb_ops = {
+#ifdef __LITTLE_ENDIAN__
+	.gdb_bpt_instr = {0x08, 0x10, 0x82, 0x7d},
+#else
 	.gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
+#endif
 };
 
 static int kgdb_not_implemented(struct pt_regs *regs)
diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c
index 2c647b1e62e4..ee62b197502d 100644
--- a/arch/powerpc/kernel/mce_power.c
+++ b/arch/powerpc/kernel/mce_power.c
@@ -54,8 +54,8 @@ static void flush_tlb_206(unsigned int num_sets, unsigned int action)
 }
 
 /*
- * Generic routine to flush TLB on power7. This routine is used as
- * flush_tlb hook in cpu_spec for Power7 processor.
+ * Generic routines to flush TLB on POWER processors. These routines
+ * are used as flush_tlb hook in the cpu_spec.
  *
  * action => TLB_INVAL_SCOPE_GLOBAL:  Invalidate all TLBs.
  *	     TLB_INVAL_SCOPE_LPID: Invalidate TLB for current LPID.
@@ -65,18 +65,17 @@ void __flush_tlb_power7(unsigned int action)
 	flush_tlb_206(POWER7_TLB_SETS, action);
 }
 
-/*
- * Generic routine to flush TLB on power8. This routine is used as
- * flush_tlb hook in cpu_spec for power8 processor.
- *
- * action => TLB_INVAL_SCOPE_GLOBAL:  Invalidate all TLBs.
- *	     TLB_INVAL_SCOPE_LPID: Invalidate TLB for current LPID.
- */
 void __flush_tlb_power8(unsigned int action)
 {
 	flush_tlb_206(POWER8_TLB_SETS, action);
 }
 
+void __flush_tlb_power9(unsigned int action)
+{
+	flush_tlb_206(POWER9_TLB_SETS_HASH, action);
+}
+
+
 /* flush SLBs and reload */
 static void flush_and_reload_slb(void)
 {
diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c
index 848b47499a27..9ce9a25f58b5 100644
--- a/arch/powerpc/kernel/module_64.c
+++ b/arch/powerpc/kernel/module_64.c
@@ -311,7 +311,7 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab)
 			if (name[0] == '.') {
 				if (strcmp(name+1, "TOC.") == 0)
 					syms[i].st_shndx = SHN_ABS;
-				memmove(name, name+1, strlen(name));
+				syms[i].st_name++;
 			}
 		}
 	}
diff --git a/arch/powerpc/kernel/pci-hotplug.c b/arch/powerpc/kernel/pci-hotplug.c
index 7f9ed0c1f6b9..59c436189f46 100644
--- a/arch/powerpc/kernel/pci-hotplug.c
+++ b/arch/powerpc/kernel/pci-hotplug.c
@@ -55,7 +55,7 @@ void pcibios_remove_pci_devices(struct pci_bus *bus)
 
 	pr_debug("PCI: Removing devices on bus %04x:%02x\n",
 		 pci_domain_nr(bus),  bus->number);
-	list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) {
+	list_for_each_entry_safe_reverse(dev, tmp, &bus->devices, bus_list) {
 		pr_debug("   Removing %s...\n", pci_name(dev));
 		pci_stop_and_remove_bus_device(dev);
 	}
diff --git a/arch/powerpc/kernel/pci_dn.c b/arch/powerpc/kernel/pci_dn.c
index b3b4df91b792..38102cb9baa9 100644
--- a/arch/powerpc/kernel/pci_dn.c
+++ b/arch/powerpc/kernel/pci_dn.c
@@ -139,6 +139,7 @@ struct pci_dn *pci_get_pdn(struct pci_dev *pdev)
 #ifdef CONFIG_PCI_IOV
 static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
 					   struct pci_dev *pdev,
+					   int vf_index,
 					   int busno, int devfn)
 {
 	struct pci_dn *pdn;
@@ -158,6 +159,7 @@ static struct pci_dn *add_one_dev_pci_data(struct pci_dn *parent,
 	pdn->busno = busno;
 	pdn->devfn = devfn;
 #ifdef CONFIG_PPC_POWERNV
+	pdn->vf_index = vf_index;
 	pdn->pe_number = IODA_INVALID_PE;
 #endif
 	INIT_LIST_HEAD(&pdn->child_list);
@@ -179,6 +181,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
 {
 #ifdef CONFIG_PCI_IOV
 	struct pci_dn *parent, *pdn;
+	struct eeh_dev *edev;
 	int i;
 
 	/* Only support IOV for now */
@@ -196,7 +199,7 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
 		return NULL;
 
 	for (i = 0; i < pci_sriov_get_totalvfs(pdev); i++) {
-		pdn = add_one_dev_pci_data(parent, NULL,
+		pdn = add_one_dev_pci_data(parent, NULL, i,
 					   pci_iov_virtfn_bus(pdev, i),
 					   pci_iov_virtfn_devfn(pdev, i));
 		if (!pdn) {
@@ -204,6 +207,12 @@ struct pci_dn *add_dev_pci_data(struct pci_dev *pdev)
 				 __func__, i);
 			return NULL;
 		}
+
+		/* Create the EEH device for the VF */
+		eeh_dev_init(pdn, pci_bus_to_host(pdev->bus));
+		edev = pdn_to_eeh_dev(pdn);
+		BUG_ON(!edev);
+		edev->physfn = pdev;
 	}
 #endif /* CONFIG_PCI_IOV */
 
@@ -215,6 +224,7 @@ void remove_dev_pci_data(struct pci_dev *pdev)
 #ifdef CONFIG_PCI_IOV
 	struct pci_dn *parent;
 	struct pci_dn *pdn, *tmp;
+	struct eeh_dev *edev;
 	int i;
 
 	/*
@@ -256,6 +266,13 @@ void remove_dev_pci_data(struct pci_dev *pdev)
 			    pdn->devfn != pci_iov_virtfn_devfn(pdev, i))
 				continue;
 
+			/* Release EEH device for the VF */
+			edev = pdn_to_eeh_dev(pdn);
+			if (edev) {
+				pdn->edev = NULL;
+				kfree(edev);
+			}
+
 			if (!list_empty(&pdn->list))
 				list_del(&pdn->list);
 
diff --git a/arch/powerpc/kernel/ppc_ksyms.c b/arch/powerpc/kernel/ppc_ksyms.c
index 41e1607e800c..ef7024dacff7 100644
--- a/arch/powerpc/kernel/ppc_ksyms.c
+++ b/arch/powerpc/kernel/ppc_ksyms.c
@@ -28,10 +28,6 @@ EXPORT_SYMBOL(load_vr_state);
 EXPORT_SYMBOL(store_vr_state);
 #endif
 
-#ifdef CONFIG_VSX
-EXPORT_SYMBOL_GPL(__giveup_vsx);
-#endif
-
 #ifdef CONFIG_EPAPR_PARAVIRT
 EXPORT_SYMBOL(epapr_hypercall_start);
 #endif
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index dccc87e8fee5..d7a9df51b974 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -133,6 +133,16 @@ void __msr_check_and_clear(unsigned long bits)
 EXPORT_SYMBOL(__msr_check_and_clear);
 
 #ifdef CONFIG_PPC_FPU
+void __giveup_fpu(struct task_struct *tsk)
+{
+	save_fpu(tsk);
+	tsk->thread.regs->msr &= ~MSR_FP;
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		tsk->thread.regs->msr &= ~MSR_VSX;
+#endif
+}
+
 void giveup_fpu(struct task_struct *tsk)
 {
 	check_if_tm_restore_required(tsk);
@@ -187,9 +197,32 @@ void enable_kernel_fp(void)
 	}
 }
 EXPORT_SYMBOL(enable_kernel_fp);
+
+static int restore_fp(struct task_struct *tsk) {
+	if (tsk->thread.load_fp) {
+		load_fp_state(&current->thread.fp_state);
+		current->thread.load_fp++;
+		return 1;
+	}
+	return 0;
+}
+#else
+static int restore_fp(struct task_struct *tsk) { return 0; }
 #endif /* CONFIG_PPC_FPU */
 
 #ifdef CONFIG_ALTIVEC
+#define loadvec(thr) ((thr).load_vec)
+
+static void __giveup_altivec(struct task_struct *tsk)
+{
+	save_altivec(tsk);
+	tsk->thread.regs->msr &= ~MSR_VEC;
+#ifdef CONFIG_VSX
+	if (cpu_has_feature(CPU_FTR_VSX))
+		tsk->thread.regs->msr &= ~MSR_VSX;
+#endif
+}
+
 void giveup_altivec(struct task_struct *tsk)
 {
 	check_if_tm_restore_required(tsk);
@@ -229,22 +262,49 @@ void flush_altivec_to_thread(struct task_struct *tsk)
 	}
 }
 EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
+
+static int restore_altivec(struct task_struct *tsk)
+{
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) && tsk->thread.load_vec) {
+		load_vr_state(&tsk->thread.vr_state);
+		tsk->thread.used_vr = 1;
+		tsk->thread.load_vec++;
+
+		return 1;
+	}
+	return 0;
+}
+#else
+#define loadvec(thr) 0
+static inline int restore_altivec(struct task_struct *tsk) { return 0; }
 #endif /* CONFIG_ALTIVEC */
 
 #ifdef CONFIG_VSX
-void giveup_vsx(struct task_struct *tsk)
+static void __giveup_vsx(struct task_struct *tsk)
 {
-	check_if_tm_restore_required(tsk);
-
-	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 	if (tsk->thread.regs->msr & MSR_FP)
 		__giveup_fpu(tsk);
 	if (tsk->thread.regs->msr & MSR_VEC)
 		__giveup_altivec(tsk);
+	tsk->thread.regs->msr &= ~MSR_VSX;
+}
+
+static void giveup_vsx(struct task_struct *tsk)
+{
+	check_if_tm_restore_required(tsk);
+
+	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 	__giveup_vsx(tsk);
 	msr_check_and_clear(MSR_FP|MSR_VEC|MSR_VSX);
 }
-EXPORT_SYMBOL(giveup_vsx);
+
+static void save_vsx(struct task_struct *tsk)
+{
+	if (tsk->thread.regs->msr & MSR_FP)
+		save_fpu(tsk);
+	if (tsk->thread.regs->msr & MSR_VEC)
+		save_altivec(tsk);
+}
 
 void enable_kernel_vsx(void)
 {
@@ -275,6 +335,19 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 	}
 }
 EXPORT_SYMBOL_GPL(flush_vsx_to_thread);
+
+static int restore_vsx(struct task_struct *tsk)
+{
+	if (cpu_has_feature(CPU_FTR_VSX)) {
+		tsk->thread.used_vsr = 1;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline int restore_vsx(struct task_struct *tsk) { return 0; }
+static inline void save_vsx(struct task_struct *tsk) { }
 #endif /* CONFIG_VSX */
 
 #ifdef CONFIG_SPE
@@ -374,12 +447,76 @@ void giveup_all(struct task_struct *tsk)
 }
 EXPORT_SYMBOL(giveup_all);
 
+void restore_math(struct pt_regs *regs)
+{
+	unsigned long msr;
+
+	if (!current->thread.load_fp && !loadvec(current->thread))
+		return;
+
+	msr = regs->msr;
+	msr_check_and_set(msr_all_available);
+
+	/*
+	 * Only reload if the bit is not set in the user MSR, the bit BEING set
+	 * indicates that the registers are hot
+	 */
+	if ((!(msr & MSR_FP)) && restore_fp(current))
+		msr |= MSR_FP | current->thread.fpexc_mode;
+
+	if ((!(msr & MSR_VEC)) && restore_altivec(current))
+		msr |= MSR_VEC;
+
+	if ((msr & (MSR_FP | MSR_VEC)) == (MSR_FP | MSR_VEC) &&
+			restore_vsx(current)) {
+		msr |= MSR_VSX;
+	}
+
+	msr_check_and_clear(msr_all_available);
+
+	regs->msr = msr;
+}
+
+void save_all(struct task_struct *tsk)
+{
+	unsigned long usermsr;
+
+	if (!tsk->thread.regs)
+		return;
+
+	usermsr = tsk->thread.regs->msr;
+
+	if ((usermsr & msr_all_available) == 0)
+		return;
+
+	msr_check_and_set(msr_all_available);
+
+	/*
+	 * Saving the way the register space is in hardware, save_vsx boils
+	 * down to a save_fpu() and save_altivec()
+	 */
+	if (usermsr & MSR_VSX) {
+		save_vsx(tsk);
+	} else {
+		if (usermsr & MSR_FP)
+			save_fpu(tsk);
+
+		if (usermsr & MSR_VEC)
+			save_altivec(tsk);
+	}
+
+	if (usermsr & MSR_SPE)
+		__giveup_spe(tsk);
+
+	msr_check_and_clear(msr_all_available);
+}
+
 void flush_all_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
 		preempt_disable();
 		BUG_ON(tsk != current);
-		giveup_all(tsk);
+		save_all(tsk);
 
 #ifdef CONFIG_SPE
 		if (tsk->thread.regs->msr & MSR_SPE)
@@ -832,17 +969,9 @@ void restore_tm_state(struct pt_regs *regs)
 
 	msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
 	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
-	if (msr_diff & MSR_FP) {
-		msr_check_and_set(MSR_FP);
-		load_fp_state(&current->thread.fp_state);
-		msr_check_and_clear(MSR_FP);
-		regs->msr |= current->thread.fpexc_mode;
-	}
-	if (msr_diff & MSR_VEC) {
-		msr_check_and_set(MSR_VEC);
-		load_vr_state(&current->thread.vr_state);
-		msr_check_and_clear(MSR_VEC);
-	}
+
+	restore_math(regs);
+
 	regs->msr |= msr_diff;
 }
 
@@ -1006,6 +1135,10 @@ struct task_struct *__switch_to(struct task_struct *prev,
 		batch = this_cpu_ptr(&ppc64_tlb_batch);
 		batch->active = 1;
 	}
+
+	if (current_thread_info()->task->thread.regs)
+		restore_math(current_thread_info()->task->thread.regs);
+
 #endif /* CONFIG_PPC_BOOK3S_64 */
 
 	return last;
@@ -1307,6 +1440,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 
 		f = ret_from_fork;
 	}
+	childregs->msr &= ~(MSR_FP|MSR_VEC|MSR_VSX);
 	sp -= STACK_FRAME_OVERHEAD;
 
 	/*
diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c
index cf8c7e4e0b21..cb64d6feb45a 100644
--- a/arch/powerpc/kernel/signal.c
+++ b/arch/powerpc/kernel/signal.c
@@ -1,7 +1,7 @@
 /*
  * Common signal handling code for both 32 and 64 bits
  *
- *    Copyright (c) 2007 Benjamin Herrenschmidt, IBM Coproration
+ *    Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation
  *    Extracted from signal_32.c and signal_64.c
  *
  * This file is subject to the terms and conditions of the GNU General
@@ -178,7 +178,7 @@ unsigned long get_tm_stackpointer(struct pt_regs *regs)
 	 * need to use the stack pointer from the checkpointed state, rather
 	 * than the speculated state.  This ensures that the signal context
 	 * (written tm suspended) will be written below the stack required for
-	 * the rollback.  The transaction is aborted becuase of the treclaim,
+	 * the rollback.  The transaction is aborted because of the treclaim,
 	 * so any memory written between the tbegin and the signal will be
 	 * rolled back anyway.
 	 *
diff --git a/arch/powerpc/kernel/signal.h b/arch/powerpc/kernel/signal.h
index 51b274199dd9..be305c858e51 100644
--- a/arch/powerpc/kernel/signal.h
+++ b/arch/powerpc/kernel/signal.h
@@ -1,5 +1,5 @@
 /*
- *    Copyright (c) 2007 Benjamin Herrenschmidt, IBM Coproration
+ *    Copyright (c) 2007 Benjamin Herrenschmidt, IBM Corporation
  *    Extracted from signal_32.c and signal_64.c
  *
  * This file is subject to the terms and conditions of the GNU General
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index b6becc795bb5..88414dde7e35 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -1148,6 +1148,7 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 		goto bail;
 	}
 	if (reason & REASON_TRAP) {
+		unsigned long bugaddr;
 		/* Debugger is first in line to stop recursive faults in
 		 * rcu_lock, notify_die, or atomic_notifier_call_chain */
 		if (debugger_bpt(regs))
@@ -1158,8 +1159,15 @@ void __kprobes program_check_exception(struct pt_regs *regs)
 				== NOTIFY_STOP)
 			goto bail;
 
+		bugaddr = regs->nip;
+		/*
+		 * Fixup bugaddr for BUG_ON() in real mode
+		 */
+		if (!is_kernel_addr(bugaddr) && !(regs->msr & MSR_IR))
+			bugaddr += PAGE_OFFSET;
+
 		if (!(regs->msr & MSR_PR) &&  /* not user-mode */
-		    report_bug(regs->nip, regs) == BUG_TRAP_TYPE_WARN) {
+		    report_bug(bugaddr, regs) == BUG_TRAP_TYPE_WARN) {
 			regs->nip += 4;
 			goto bail;
 		}
@@ -1394,7 +1402,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		 * is a read DSCR attempt through a mfspr instruction, we
 		 * just emulate the instruction instead. This code path will
 		 * always emulate all the mfspr instructions till the user
-		 * has attempted atleast one mtspr instruction. This way it
+		 * has attempted at least one mtspr instruction. This way it
 		 * preserves the same behaviour when the user is accessing
 		 * the DSCR through privilege level only SPR number (0x11)
 		 * which is emulated through illegal instruction exception.
diff --git a/arch/powerpc/kernel/vector.S b/arch/powerpc/kernel/vector.S
index 162d0f714941..1c2e7a343bf5 100644
--- a/arch/powerpc/kernel/vector.S
+++ b/arch/powerpc/kernel/vector.S
@@ -91,6 +91,10 @@ _GLOBAL(load_up_altivec)
 	oris	r12,r12,MSR_VEC@h
 	std	r12,_MSR(r1)
 #endif
+	/* Don't care if r4 overflows, this is desired behaviour */
+	lbz	r4,THREAD_LOAD_VEC(r5)
+	addi	r4,r4,1
+	stb	r4,THREAD_LOAD_VEC(r5)
 	addi	r6,r5,THREAD_VRSTATE
 	li	r4,1
 	li	r10,VRSTATE_VSCR
@@ -102,36 +106,20 @@ _GLOBAL(load_up_altivec)
 	blr
 
 /*
- * __giveup_altivec(tsk)
- * Disable VMX for the task given as the argument,
- * and save the vector registers in its thread_struct.
+ * save_altivec(tsk)
+ * Save the vector registers to its thread_struct
  */
-_GLOBAL(__giveup_altivec)
+_GLOBAL(save_altivec)
 	addi	r3,r3,THREAD		/* want THREAD of task */
 	PPC_LL	r7,THREAD_VRSAVEAREA(r3)
 	PPC_LL	r5,PT_REGS(r3)
 	PPC_LCMPI	0,r7,0
 	bne	2f
 	addi	r7,r3,THREAD_VRSTATE
-2:	PPC_LCMPI	0,r5,0
-	SAVE_32VRS(0,r4,r7)
+2:	SAVE_32VRS(0,r4,r7)
 	mfvscr	v0
 	li	r4,VRSTATE_VSCR
 	stvx	v0,r4,r7
-	beq	1f
-	PPC_LL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
-	lis	r3,(MSR_VEC|MSR_VSX)@h
-FTR_SECTION_ELSE
-	lis	r3,MSR_VEC@h
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_VSX)
-#else
-	lis	r3,MSR_VEC@h
-#endif
-	andc	r4,r4,r3		/* disable FP for previous task */
-	PPC_STL	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
 	blr
 
 #ifdef CONFIG_VSX
@@ -163,23 +151,6 @@ _GLOBAL(load_up_vsx)
 	std	r12,_MSR(r1)
 	b	fast_exception_return
 
-/*
- * __giveup_vsx(tsk)
- * Disable VSX for the task given as the argument.
- * Does NOT save vsx registers.
- */
-_GLOBAL(__giveup_vsx)
-	addi	r3,r3,THREAD		/* want THREAD of task */
-	ld	r5,PT_REGS(r3)
-	cmpdi	0,r5,0
-	beq	1f
-	ld	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-	lis	r3,MSR_VSX@h
-	andc	r4,r4,r3		/* disable VSX for previous task */
-	std	r4,_MSR-STACK_FRAME_OVERHEAD(r5)
-1:
-	blr
-
 #endif /* CONFIG_VSX */
 
 
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 55c4d51ea3e2..999106991a76 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -22,7 +22,7 @@
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash32.h>
+#include <asm/book3s/32/mmu-hash.h>
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index 9bf7031a67ff..b9131aa1aedf 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -26,7 +26,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 
 /* #define DEBUG_MMU */
 
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 913cd2198fa6..114edace6cdd 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -23,7 +23,7 @@
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index fb37290a57b4..c7b78d8336b2 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -32,7 +32,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 54cf9bc94dad..9c3b76bb69d9 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -30,7 +30,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 89e96b3e0039..039028d3ccb5 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -29,7 +29,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 91700518bbf3..4cb8db05f3e5 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -17,7 +17,7 @@
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6ee26de9a1de..c613fee0b9f7 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -27,7 +27,7 @@
 #include <asm/asm-offsets.h>
 #include <asm/exception-64s.h>
 #include <asm/kvm_book3s_asm.h>
-#include <asm/mmu-hash64.h>
+#include <asm/book3s/64/mmu-hash.h>
 #include <asm/tm.h>
 
 #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index 905e94a1370f..46871d554057 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -432,7 +432,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	 * the whole masked_pending business which is about not
 	 * losing interrupts that occur while masked.
 	 *
-	 * I don't differenciate normal deliveries and resends, this
+	 * I don't differentiate normal deliveries and resends, this
 	 * implementation will differ from PAPR and not lose such
 	 * interrupts.
 	 */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 778ef86e187e..4d66f44a1657 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -992,7 +992,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	kvmppc_restart_interrupt(vcpu, exit_nr);
 
 	/*
-	 * get last instruction before beeing preempted
+	 * get last instruction before being preempted
 	 * TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR & ESR_DATA
 	 */
 	switch (exit_nr) {
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index cda695de8aa7..f48a0c22e8f9 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -182,7 +182,7 @@ int kvmppc_core_check_processor_compat(void)
 		r = 0;
 #ifdef CONFIG_ALTIVEC
 	/*
-	 * Since guests have the priviledge to enable AltiVec, we need AltiVec
+	 * Since guests have the privilege to enable AltiVec, we need AltiVec
 	 * support in the host to save/restore their context.
 	 * Don't use CPU_FTR_ALTIVEC to identify cores with AltiVec unit
 	 * because it's cleared in the absence of CONFIG_ALTIVEC!
diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c
index e7c04542ba62..47d1b26effc6 100644
--- a/arch/powerpc/mm/hash64_4k.c
+++ b/arch/powerpc/mm/hash64_4k.c
@@ -44,7 +44,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 		 * a write access. Since this is 4K insert of 64K page size
 		 * also add _PAGE_COMBO
 		 */
-		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED | _PAGE_HASHPTE;
+		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
 		if (access & _PAGE_RW)
 			new_pte |= _PAGE_DIRTY;
 	} while (old_pte != __cmpxchg_u64((unsigned long *)ptep,
@@ -106,7 +106,7 @@ repeat:
 			}
 		}
 		/*
-		 * Hypervisor failure. Restore old pmd and return -1
+		 * Hypervisor failure. Restore old pte and return -1
 		 * similar to __hash_page_*
 		 */
 		if (unlikely(slot == -2)) {
diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c
index 0762c1e08c88..b2d659cf51c6 100644
--- a/arch/powerpc/mm/hash64_64k.c
+++ b/arch/powerpc/mm/hash64_64k.c
@@ -111,7 +111,13 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid,
 	 */
 	if (!(old_pte & _PAGE_COMBO)) {
 		flush_hash_page(vpn, rpte, MMU_PAGE_64K, ssize, flags);
-		old_pte &= ~_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND;
+		/*
+		 * clear the old slot details from the old and new pte.
+		 * On hash insert failure we use old pte value and we don't
+		 * want slot information there if we have a insert failure.
+		 */
+		old_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
+		new_pte &= ~(_PAGE_HASHPTE | _PAGE_F_GIX | _PAGE_F_SECOND);
 		goto htab_insert_hpte;
 	}
 	/*
@@ -182,7 +188,7 @@ repeat:
 		}
 	}
 	/*
-	 * Hypervisor failure. Restore old pmd and return -1
+	 * Hypervisor failure. Restore old pte and return -1
 	 * similar to __hash_page_*
 	 */
 	if (unlikely(slot == -2)) {
@@ -243,8 +249,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access,
 			return 0;
 		/*
 		 * Try to lock the PTE, add ACCESSED and DIRTY if it was
-		 * a write access. Since this is 4K insert of 64K page size
-		 * also add _PAGE_COMBO
+		 * a write access.
 		 */
 		new_pte = old_pte | _PAGE_BUSY | _PAGE_ACCESSED;
 		if (access & _PAGE_RW)
@@ -305,7 +310,7 @@ repeat:
 			}
 		}
 		/*
-		 * Hypervisor failure. Restore old pmd and return -1
+		 * Hypervisor failure. Restore old pte and return -1
 		 * similar to __hash_page_*
 		 */
 		if (unlikely(slot == -2)) {
diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c
index ba59d5977f34..90dd9280894f 100644
--- a/arch/powerpc/mm/hash_utils_64.c
+++ b/arch/powerpc/mm/hash_utils_64.c
@@ -168,11 +168,11 @@ unsigned long htab_convert_pte_flags(unsigned long pteflags)
 		rflags |= HPTE_R_N;
 	/*
 	 * PP bits:
-	 * Linux use slb key 0 for kernel and 1 for user.
-	 * kernel areas are mapped by PP bits 00
-	 * and and there is no kernel RO (_PAGE_KERNEL_RO).
-	 * User area mapped by 0x2 and read only use by
-	 * 0x3.
+	 * Linux uses slb key 0 for kernel and 1 for user.
+	 * kernel areas are mapped with PP=00
+	 * and there is no kernel RO (_PAGE_KERNEL_RO).
+	 * User area is mapped with PP=0x2 for read/write
+	 * or PP=0x3 for read-only (including writeable but clean pages).
 	 */
 	if (pteflags & _PAGE_USER) {
 		rflags |= 0x2;
@@ -263,28 +263,32 @@ int htab_bolt_mapping(unsigned long vstart, unsigned long vend,
 	return ret < 0 ? ret : 0;
 }
 
-#ifdef CONFIG_MEMORY_HOTPLUG
 int htab_remove_mapping(unsigned long vstart, unsigned long vend,
 		      int psize, int ssize)
 {
 	unsigned long vaddr;
 	unsigned int step, shift;
+	int rc;
+	int ret = 0;
 
 	shift = mmu_psize_defs[psize].shift;
 	step = 1 << shift;
 
-	if (!ppc_md.hpte_removebolted) {
-		printk(KERN_WARNING "Platform doesn't implement "
-				"hpte_removebolted\n");
-		return -EINVAL;
-	}
+	if (!ppc_md.hpte_removebolted)
+		return -ENODEV;
 
-	for (vaddr = vstart; vaddr < vend; vaddr += step)
-		ppc_md.hpte_removebolted(vaddr, psize, ssize);
+	for (vaddr = vstart; vaddr < vend; vaddr += step) {
+		rc = ppc_md.hpte_removebolted(vaddr, psize, ssize);
+		if (rc == -ENOENT) {
+			ret = -ENOENT;
+			continue;
+		}
+		if (rc < 0)
+			return rc;
+	}
 
-	return 0;
+	return ret;
 }
-#endif /* CONFIG_MEMORY_HOTPLUG */
 
 static int __init htab_dt_scan_seg_sizes(unsigned long node,
 					 const char *uname, int depth,
@@ -605,10 +609,28 @@ static int __init htab_dt_scan_pftsize(unsigned long node,
 	return 0;
 }
 
-static unsigned long __init htab_get_table_size(void)
+unsigned htab_shift_for_mem_size(unsigned long mem_size)
 {
-	unsigned long mem_size, rnd_mem_size, pteg_count, psize;
+	unsigned memshift = __ilog2(mem_size);
+	unsigned pshift = mmu_psize_defs[mmu_virtual_psize].shift;
+	unsigned pteg_shift;
+
+	/* round mem_size up to next power of 2 */
+	if ((1UL << memshift) < mem_size)
+		memshift += 1;
+
+	/* aim for 2 pages / pteg */
+	pteg_shift = memshift - (pshift + 1);
 
+	/*
+	 * 2^11 PTEGS of 128 bytes each, ie. 2^18 bytes is the minimum htab
+	 * size permitted by the architecture.
+	 */
+	return max(pteg_shift + 7, 18U);
+}
+
+static unsigned long __init htab_get_table_size(void)
+{
 	/* If hash size isn't already provided by the platform, we try to
 	 * retrieve it from the device-tree. If it's not there neither, we
 	 * calculate it now based on the total RAM size
@@ -618,31 +640,30 @@ static unsigned long __init htab_get_table_size(void)
 	if (ppc64_pft_size)
 		return 1UL << ppc64_pft_size;
 
-	/* round mem_size up to next power of 2 */
-	mem_size = memblock_phys_mem_size();
-	rnd_mem_size = 1UL << __ilog2(mem_size);
-	if (rnd_mem_size < mem_size)
-		rnd_mem_size <<= 1;
-
-	/* # pages / 2 */
-	psize = mmu_psize_defs[mmu_virtual_psize].shift;
-	pteg_count = max(rnd_mem_size >> (psize + 1), 1UL << 11);
-
-	return pteg_count << 7;
+	return 1UL << htab_shift_for_mem_size(memblock_phys_mem_size());
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int create_section_mapping(unsigned long start, unsigned long end)
 {
-	return htab_bolt_mapping(start, end, __pa(start),
-				 pgprot_val(PAGE_KERNEL), mmu_linear_psize,
-				 mmu_kernel_ssize);
+	int rc = htab_bolt_mapping(start, end, __pa(start),
+				   pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+				   mmu_kernel_ssize);
+
+	if (rc < 0) {
+		int rc2 = htab_remove_mapping(start, end, mmu_linear_psize,
+					      mmu_kernel_ssize);
+		BUG_ON(rc2 && (rc2 != -ENOENT));
+	}
+	return rc;
 }
 
 int remove_section_mapping(unsigned long start, unsigned long end)
 {
-	return htab_remove_mapping(start, end, mmu_linear_psize,
-			mmu_kernel_ssize);
+	int rc = htab_remove_mapping(start, end, mmu_linear_psize,
+				     mmu_kernel_ssize);
+	WARN_ON(rc < 0);
+	return rc;
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
diff --git a/arch/powerpc/mm/hugepage-hash64.c b/arch/powerpc/mm/hugepage-hash64.c
index 49b152b0f926..eb2accdd76fd 100644
--- a/arch/powerpc/mm/hugepage-hash64.c
+++ b/arch/powerpc/mm/hugepage-hash64.c
@@ -78,9 +78,19 @@ int __hash_page_thp(unsigned long ea, unsigned long access, unsigned long vsid,
 		 * base page size. This is because demote_segment won't flush
 		 * hash page table entries.
 		 */
-		if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO))
+		if ((old_pmd & _PAGE_HASHPTE) && !(old_pmd & _PAGE_COMBO)) {
 			flush_hash_hugepage(vsid, ea, pmdp, MMU_PAGE_64K,
 					    ssize, flags);
+			/*
+			 * With THP, we also clear the slot information with
+			 * respect to all the 64K hash pte mapping the 16MB
+			 * page. They are all invalid now. This make sure we
+			 * don't find the slot valid when we fault with 4k
+			 * base page size.
+			 *
+			 */
+			memset(hpte_slot_array, 0, PTE_FRAG_SIZE);
+		}
 	}
 
 	valid = hpte_valid(hpte_slot_array, index);
diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c
index e2138c7ae70f..8555fce902fe 100644
--- a/arch/powerpc/mm/hugetlbpage-hash64.c
+++ b/arch/powerpc/mm/hugetlbpage-hash64.c
@@ -76,7 +76,7 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 		if (old_pte & _PAGE_F_SECOND)
 			hash = ~hash;
 		slot = (hash & htab_hash_mask) * HPTES_PER_GROUP;
-		slot += (old_pte & _PAGE_F_GIX) >> 12;
+		slot += (old_pte & _PAGE_F_GIX) >> _PAGE_F_GIX_SHIFT;
 
 		if (ppc_md.hpte_updatepp(slot, rflags, vpn, mmu_psize,
 					 mmu_psize, ssize, flags) == -1)
@@ -105,7 +105,8 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid,
 			return -1;
 		}
 
-		new_pte |= (slot << 12) & (_PAGE_F_SECOND | _PAGE_F_GIX);
+		new_pte |= (slot << _PAGE_F_GIX_SHIFT) &
+			(_PAGE_F_SECOND | _PAGE_F_GIX);
 	}
 
 	/*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 744e24bcb85c..6dd272b6196f 100644
--- a/arch/powerpc/mm/hugetlbpage.c
+++ b/arch/powerpc/mm/hugetlbpage.c
@@ -107,8 +107,7 @@ static int __hugepte_alloc(struct mm_struct *mm, hugepd_t *hpdp,
 		kmem_cache_free(cachep, new);
 	else {
 #ifdef CONFIG_PPC_BOOK3S_64
-		hpdp->pd = (unsigned long)new |
-			    (shift_to_mmu_psize(pshift) << 2);
+		hpdp->pd = __pa(new) | (shift_to_mmu_psize(pshift) << 2);
 #else
 		hpdp->pd = ((unsigned long)new & ~PD_HUGE) | pshift;
 #endif
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index 379a6a90644b..ba655666186d 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -85,6 +85,11 @@ static void pgd_ctor(void *addr)
 	memset(addr, 0, PGD_TABLE_SIZE);
 }
 
+static void pud_ctor(void *addr)
+{
+	memset(addr, 0, PUD_TABLE_SIZE);
+}
+
 static void pmd_ctor(void *addr)
 {
 	memset(addr, 0, PMD_TABLE_SIZE);
@@ -138,14 +143,18 @@ void pgtable_cache_init(void)
 {
 	pgtable_cache_add(PGD_INDEX_SIZE, pgd_ctor);
 	pgtable_cache_add(PMD_CACHE_INDEX, pmd_ctor);
+	/*
+	 * In all current configs, when the PUD index exists it's the
+	 * same size as either the pgd or pmd index except with THP enabled
+	 * on book3s 64
+	 */
+	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
+		pgtable_cache_add(PUD_INDEX_SIZE, pud_ctor);
+
 	if (!PGT_CACHE(PGD_INDEX_SIZE) || !PGT_CACHE(PMD_CACHE_INDEX))
 		panic("Couldn't allocate pgtable caches");
-	/* In all current configs, when the PUD index exists it's the
-	 * same size as either the pgd or pmd index.  Verify that the
-	 * initialization above has also created a PUD cache.  This
-	 * will need re-examiniation if we add new possibilities for
-	 * the pagetable layout. */
-	BUG_ON(PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE));
+	if (PUD_INDEX_SIZE && !PGT_CACHE(PUD_INDEX_SIZE))
+		panic("Couldn't allocate pud pgtable caches");
 }
 
 #ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -188,9 +197,9 @@ static int __meminit vmemmap_populated(unsigned long start, int page_size)
  */
 
 #ifdef CONFIG_PPC_BOOK3E
-static void __meminit vmemmap_create_mapping(unsigned long start,
-					     unsigned long page_size,
-					     unsigned long phys)
+static int __meminit vmemmap_create_mapping(unsigned long start,
+					    unsigned long page_size,
+					    unsigned long phys)
 {
 	/* Create a PTE encoding without page size */
 	unsigned long i, flags = _PAGE_PRESENT | _PAGE_ACCESSED |
@@ -208,6 +217,8 @@ static void __meminit vmemmap_create_mapping(unsigned long start,
 	 */
 	for (i = 0; i < page_size; i += PAGE_SIZE)
 		BUG_ON(map_kernel_page(start + i, phys, flags));
+
+	return 0;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
@@ -217,25 +228,31 @@ static void vmemmap_remove_mapping(unsigned long start,
 }
 #endif
 #else /* CONFIG_PPC_BOOK3E */
-static void __meminit vmemmap_create_mapping(unsigned long start,
-					     unsigned long page_size,
-					     unsigned long phys)
+static int __meminit vmemmap_create_mapping(unsigned long start,
+					    unsigned long page_size,
+					    unsigned long phys)
 {
-	int  mapped = htab_bolt_mapping(start, start + page_size, phys,
-					pgprot_val(PAGE_KERNEL),
-					mmu_vmemmap_psize,
-					mmu_kernel_ssize);
-	BUG_ON(mapped < 0);
+	int rc = htab_bolt_mapping(start, start + page_size, phys,
+				   pgprot_val(PAGE_KERNEL),
+				   mmu_vmemmap_psize, mmu_kernel_ssize);
+	if (rc < 0) {
+		int rc2 = htab_remove_mapping(start, start + page_size,
+					      mmu_vmemmap_psize,
+					      mmu_kernel_ssize);
+		BUG_ON(rc2 && (rc2 != -ENOENT));
+	}
+	return rc;
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 static void vmemmap_remove_mapping(unsigned long start,
 				   unsigned long page_size)
 {
-	int mapped = htab_remove_mapping(start, start + page_size,
-					 mmu_vmemmap_psize,
-					 mmu_kernel_ssize);
-	BUG_ON(mapped < 0);
+	int rc = htab_remove_mapping(start, start + page_size,
+				     mmu_vmemmap_psize,
+				     mmu_kernel_ssize);
+	BUG_ON((rc < 0) && (rc != -ENOENT));
+	WARN_ON(rc == -ENOENT);
 }
 #endif
 
@@ -303,6 +320,7 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 
 	for (; start < end; start += page_size) {
 		void *p;
+		int rc;
 
 		if (vmemmap_populated(start, page_size))
 			continue;
@@ -316,7 +334,13 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 		pr_debug("      * %016lx..%016lx allocated at %p\n",
 			 start, start + page_size, p);
 
-		vmemmap_create_mapping(start, page_size, __pa(p));
+		rc = vmemmap_create_mapping(start, page_size, __pa(p));
+		if (rc < 0) {
+			pr_warning(
+				"vmemmap_populate: Unable to create vmemmap mapping: %d\n",
+				rc);
+			return -EFAULT;
+		}
 	}
 
 	return 0;
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index d0f0a514b04e..f980da6d7569 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -119,12 +119,18 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 	struct zone *zone;
 	unsigned long start_pfn = start >> PAGE_SHIFT;
 	unsigned long nr_pages = size >> PAGE_SHIFT;
+	int rc;
 
 	pgdata = NODE_DATA(nid);
 
 	start = (unsigned long)__va(start);
-	if (create_section_mapping(start, start + size))
-		return -EINVAL;
+	rc = create_section_mapping(start, start + size);
+	if (rc) {
+		pr_warning(
+			"Unable to create mapping for hot added memory 0x%llx..0x%llx: %d\n",
+			start, start + size, rc);
+		return -EFAULT;
+	}
 
 	/* this should work for most non-highmem platforms */
 	zone = pgdata->node_zones +
diff --git a/arch/powerpc/mm/mmu_decl.h b/arch/powerpc/mm/mmu_decl.h
index 9f58ff44a075..898d63365cdd 100644
--- a/arch/powerpc/mm/mmu_decl.h
+++ b/arch/powerpc/mm/mmu_decl.h
@@ -110,7 +110,8 @@ extern unsigned long Hash_size, Hash_mask;
 #endif /* CONFIG_PPC32 */
 
 #ifdef CONFIG_PPC64
-extern int map_kernel_page(unsigned long ea, unsigned long pa, int flags);
+extern int map_kernel_page(unsigned long ea, unsigned long pa,
+			   unsigned long flags);
 #endif /* CONFIG_PPC64 */
 
 extern unsigned long ioremap_bot;
diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c
index 3124a20d0fab..0eb53128ca2a 100644
--- a/arch/powerpc/mm/pgtable_64.c
+++ b/arch/powerpc/mm/pgtable_64.c
@@ -88,7 +88,7 @@ static __ref void *early_alloc_pgtable(unsigned long size)
  * map_kernel_page adds an entry to the ioremap page table
  * and adds an entry to the HPT, possibly bolting it
  */
-int map_kernel_page(unsigned long ea, unsigned long pa, int flags)
+int map_kernel_page(unsigned long ea, unsigned long pa, unsigned long flags)
 {
 	pgd_t *pgdp;
 	pud_t *pudp;
@@ -646,6 +646,28 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
 	return pgtable;
 }
 
+void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+			     unsigned long address, pmd_t *pmdp)
+{
+	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
+	VM_BUG_ON(REGION_ID(address) != USER_REGION_ID);
+
+	/*
+	 * We can't mark the pmd none here, because that will cause a race
+	 * against exit_mmap. We need to continue mark pmd TRANS HUGE, while
+	 * we spilt, but at the same time we wan't rest of the ppc64 code
+	 * not to insert hash pte on this, because we will be modifying
+	 * the deposited pgtable in the caller of this function. Hence
+	 * clear the _PAGE_USER so that we move the fault handling to
+	 * higher level function and that will serialize against ptl.
+	 * We need to flush existing hash pte entries here even though,
+	 * the translation is still valid, because we will withdraw
+	 * pgtable_t after this.
+	 */
+	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_USER, 0);
+}
+
+
 /*
  * set a new huge pmd. We should not be called for updating
  * an existing pmd entry. That should go via pmd_hugepage_update.
@@ -663,10 +685,20 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	return set_pte_at(mm, addr, pmdp_ptep(pmdp), pmd_pte(pmd));
 }
 
+/*
+ * We use this to invalidate a pmdp entry before switching from a
+ * hugepte to regular pmd entry.
+ */
 void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 		     pmd_t *pmdp)
 {
 	pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0);
+
+	/*
+	 * This ensures that generic code that rely on IRQ disabling
+	 * to prevent a parallel THP split work as expected.
+	 */
+	kick_all_cpus_sync();
 }
 
 /*
@@ -717,7 +749,7 @@ pmd_t pfn_pmd(unsigned long pfn, pgprot_t pgprot)
 {
 	unsigned long pmdv;
 
-	pmdv = pfn << PTE_RPN_SHIFT;
+	pmdv = (pfn << PTE_RPN_SHIFT) & PTE_RPN_MASK;
 	return pmd_set_protbits(__pmd(pmdv), pgprot);
 }
 
@@ -785,6 +817,13 @@ pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 
 int has_transparent_hugepage(void)
 {
+
+	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) >= MAX_ORDER,
+		"hugepages can't be allocated by the buddy allocator");
+
+	BUILD_BUG_ON_MSG((PMD_SHIFT - PAGE_SHIFT) < 2,
+			 "We need more than 2 pages to do deferred thp split");
+
 	if (!mmu_has_feature(MMU_FTR_16M_PAGE))
 		return 0;
 	/*
diff --git a/arch/powerpc/mm/tlb_low_64e.S b/arch/powerpc/mm/tlb_low_64e.S
index 29d6987c37ba..eb82d787d99a 100644
--- a/arch/powerpc/mm/tlb_low_64e.S
+++ b/arch/powerpc/mm/tlb_low_64e.S
@@ -895,7 +895,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_USE_PAIRED_MAS)
 BEGIN_MMU_FTR_SECTION
 virt_page_table_tlb_miss_done:
 
-	/* We have overriden MAS2:EPN but currently our primary TLB miss
+	/* We have overridden MAS2:EPN but currently our primary TLB miss
 	 * handler will always restore it so that should not be an issue,
 	 * if we ever optimize the primary handler to not write MAS2 on
 	 * some cases, we'll have to restore MAS2:EPN here based on the
diff --git a/arch/powerpc/mm/tlb_nohash_low.S b/arch/powerpc/mm/tlb_nohash_low.S
index 68c477592e43..eabecfcaef7c 100644
--- a/arch/powerpc/mm/tlb_nohash_low.S
+++ b/arch/powerpc/mm/tlb_nohash_low.S
@@ -108,7 +108,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
 	blr
 2:
 #ifdef CONFIG_PPC_47x
-	oris	r7,r6,0x8000	/* specify way explicitely */
+	oris	r7,r6,0x8000	/* specify way explicitly */
 	clrrwi	r4,r3,12	/* get an EPN for the hashing with V = 0 */
 	ori	r4,r4,PPC47x_TLBE_SIZE
 	tlbwe   r4,r7,0		/* write it */
@@ -149,7 +149,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_47x)
 	li	r3,-1		/* Current set */
 	lis	r10,tlb_47x_boltmap@h
 	ori	r10,r10,tlb_47x_boltmap@l
-	lis	r7,0x8000	/* Specify way explicitely */
+	lis	r7,0x8000	/* Specify way explicitly */
 
 	b	9f		/* For each set */
 
diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c
index 863d89386f60..c82497a31c54 100644
--- a/arch/powerpc/oprofile/op_model_cell.c
+++ b/arch/powerpc/oprofile/op_model_cell.c
@@ -208,7 +208,7 @@ static void pm_rtas_reset_signals(u32 node)
 
 	/*
 	 * The debug bus is being set to the passthru disable state.
-	 * However, the FW still expects atleast one legal signal routing
+	 * However, the FW still expects at least one legal signal routing
 	 * entry or it will return an error on the arguments.	If we don't
 	 * supply a valid entry, we must ignore all return values.  Ignoring
 	 * all return values means we might miss an error we should be
@@ -1008,7 +1008,7 @@ static int initial_lfsr[] = {
  *
  * To avoid the time to compute the LFSR, a lookup table is used.  The 24 bit
  * LFSR sequence is broken into four ranges.  The spacing of the precomputed
- * values is adjusted in each range so the error between the user specifed
+ * values is adjusted in each range so the error between the user specified
  * number (N) of events between samples and the actual number of events based
  * on the precomputed value will be les then about 6.2%.  Note, if the user
  * specifies N < 2^16, the LFSR value that is 2^16 from the end will be used.
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index d1e65ce545b3..97a1d40d8696 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -651,7 +651,7 @@ static void pmao_restore_workaround(bool ebb)
 
 	/*
 	 * We are already soft-disabled in power_pmu_enable(). We need to hard
-	 * enable to actually prevent the PMU exception from firing.
+	 * disable to actually prevent the PMU exception from firing.
 	 */
 	hard_irq_disable();
 
diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c
index 9f9dfda9ed2c..59012e750abe 100644
--- a/arch/powerpc/perf/hv-24x7.c
+++ b/arch/powerpc/perf/hv-24x7.c
@@ -27,20 +27,6 @@
 #include "hv-24x7-catalog.h"
 #include "hv-common.h"
 
-static const char *event_domain_suffix(unsigned domain)
-{
-	switch (domain) {
-#define DOMAIN(n, v, x, c)		\
-	case HV_PERF_DOMAIN_##n:	\
-		return "__" #n;
-#include "hv-24x7-domains.h"
-#undef DOMAIN
-	default:
-		WARN(1, "unknown domain %d\n", domain);
-		return "__UNKNOWN_DOMAIN_SUFFIX";
-	}
-}
-
 static bool domain_is_valid(unsigned domain)
 {
 	switch (domain) {
@@ -68,6 +54,24 @@ static bool is_physical_domain(unsigned domain)
 	}
 }
 
+static const char *domain_name(unsigned domain)
+{
+	if (!domain_is_valid(domain))
+		return NULL;
+
+	switch (domain) {
+	case HV_PERF_DOMAIN_PHYS_CHIP:		return "Physical Chip";
+	case HV_PERF_DOMAIN_PHYS_CORE:		return "Physical Core";
+	case HV_PERF_DOMAIN_VCPU_HOME_CORE:	return "VCPU Home Core";
+	case HV_PERF_DOMAIN_VCPU_HOME_CHIP:	return "VCPU Home Chip";
+	case HV_PERF_DOMAIN_VCPU_HOME_NODE:	return "VCPU Home Node";
+	case HV_PERF_DOMAIN_VCPU_REMOTE_NODE:	return "VCPU Remote Node";
+	}
+
+	WARN_ON_ONCE(domain);
+	return NULL;
+}
+
 static bool catalog_entry_domain_is_valid(unsigned domain)
 {
 	return is_physical_domain(domain);
@@ -101,6 +105,7 @@ static bool catalog_entry_domain_is_valid(unsigned domain)
 EVENT_DEFINE_RANGE_FORMAT(domain, config, 0, 3);
 /* u16 */
 EVENT_DEFINE_RANGE_FORMAT(core, config, 16, 31);
+EVENT_DEFINE_RANGE_FORMAT(chip, config, 16, 31);
 EVENT_DEFINE_RANGE_FORMAT(vcpu, config, 16, 31);
 /* u32, see "data_offset" */
 EVENT_DEFINE_RANGE_FORMAT(offset, config, 32, 63);
@@ -115,6 +120,7 @@ static struct attribute *format_attrs[] = {
 	&format_attr_domain.attr,
 	&format_attr_offset.attr,
 	&format_attr_core.attr,
+	&format_attr_chip.attr,
 	&format_attr_vcpu.attr,
 	&format_attr_lpar.attr,
 	NULL,
@@ -274,32 +280,70 @@ static unsigned long h_get_24x7_catalog_page(char page[],
 					version, index);
 }
 
-static unsigned core_domains[] = {
-	HV_PERF_DOMAIN_PHYS_CORE,
-	HV_PERF_DOMAIN_VCPU_HOME_CORE,
-	HV_PERF_DOMAIN_VCPU_HOME_CHIP,
-	HV_PERF_DOMAIN_VCPU_HOME_NODE,
-	HV_PERF_DOMAIN_VCPU_REMOTE_NODE,
-};
-/* chip event data always yeilds a single event, core yeilds multiple */
-#define MAX_EVENTS_PER_EVENT_DATA ARRAY_SIZE(core_domains)
-
+/*
+ * Each event we find in the catalog, will have a sysfs entry. Format the
+ * data for this sysfs entry based on the event's domain.
+ *
+ * Events belonging to the Chip domain can only be monitored in that domain.
+ * i.e the domain for these events is a fixed/knwon value.
+ *
+ * Events belonging to the Core domain can be monitored either in the physical
+ * core or in one of the virtual CPU domains. So the domain value for these
+ * events must be specified by the user (i.e is a required parameter). Format
+ * the Core events with 'domain=?' so the perf-tool can error check required
+ * parameters.
+ *
+ * NOTE: For the Core domain events, rather than making domain a required
+ *	 parameter we could default it to PHYS_CORE and allowe users to
+ *	 override the domain to one of the VCPU domains.
+ *
+ *	 However, this can make the interface a little inconsistent.
+ *
+ *	 If we set domain=2 (PHYS_CHIP) and allow user to override this field
+ *	 the user may be tempted to also modify the "offset=x" field in which
+ *	 can lead to confusing usage. Consider the HPM_PCYC (offset=0x18) and
+ *	 HPM_INST (offset=0x20) events. With:
+ *
+ *		perf stat -e hv_24x7/HPM_PCYC,offset=0x20/
+ *
+ *	we end up monitoring HPM_INST, while the command line has HPM_PCYC.
+ *
+ *	By not assigning a default value to the domain for the Core events,
+ *	we can have simple guidelines:
+ *
+ *		- Specifying values for parameters with "=?" is required.
+ *
+ *		- Specifying (i.e overriding) values for other parameters
+ *		  is undefined.
+ */
 static char *event_fmt(struct hv_24x7_event_data *event, unsigned domain)
 {
 	const char *sindex;
 	const char *lpar;
+	const char *domain_str;
+	char buf[8];
 
-	if (is_physical_domain(domain)) {
+	switch (domain) {
+	case HV_PERF_DOMAIN_PHYS_CHIP:
+		snprintf(buf, sizeof(buf), "%d", domain);
+		domain_str = buf;
+		lpar = "0x0";
+		sindex = "chip";
+		break;
+	case HV_PERF_DOMAIN_PHYS_CORE:
+		domain_str = "?";
 		lpar = "0x0";
 		sindex = "core";
-	} else {
+		break;
+	default:
+		domain_str = "?";
 		lpar = "?";
 		sindex = "vcpu";
 	}
 
 	return kasprintf(GFP_KERNEL,
-			"domain=0x%x,offset=0x%x,%s=?,lpar=%s",
-			domain,
+			"domain=%s,offset=0x%x,%s=?,lpar=%s",
+			domain_str,
 			be16_to_cpu(event->event_counter_offs) +
 				be16_to_cpu(event->event_group_record_offs),
 			sindex,
@@ -339,6 +383,15 @@ static struct attribute *device_str_attr_create_(char *name, char *str)
 	return &attr->attr.attr;
 }
 
+/*
+ * Allocate and initialize strings representing event attributes.
+ *
+ * NOTE: The strings allocated here are never destroyed and continue to
+ *	 exist till shutdown. This is to allow us to create as many events
+ *	 from the catalog as possible, even if we encounter errors with some.
+ *	 In case of changes to error paths in future, these may need to be
+ *	 freed by the caller.
+ */
 static struct attribute *device_str_attr_create(char *name, int name_max,
 						int name_nonce,
 						char *str, size_t str_max)
@@ -370,16 +423,6 @@ out_s:
 	return NULL;
 }
 
-static void device_str_attr_destroy(struct attribute *attr)
-{
-	struct dev_ext_attribute *d;
-
-	d = container_of(attr, struct dev_ext_attribute, attr.attr);
-	kfree(d->var);
-	kfree(d->attr.attr.name);
-	kfree(d);
-}
-
 static struct attribute *event_to_attr(unsigned ix,
 				       struct hv_24x7_event_data *event,
 				       unsigned domain,
@@ -387,7 +430,6 @@ static struct attribute *event_to_attr(unsigned ix,
 {
 	int event_name_len;
 	char *ev_name, *a_ev_name, *val;
-	const char *ev_suffix;
 	struct attribute *attr;
 
 	if (!domain_is_valid(domain)) {
@@ -400,14 +442,13 @@ static struct attribute *event_to_attr(unsigned ix,
 	if (!val)
 		return NULL;
 
-	ev_suffix = event_domain_suffix(domain);
 	ev_name = event_name(event, &event_name_len);
 	if (!nonce)
-		a_ev_name = kasprintf(GFP_KERNEL, "%.*s%s",
-				(int)event_name_len, ev_name, ev_suffix);
+		a_ev_name = kasprintf(GFP_KERNEL, "%.*s",
+				(int)event_name_len, ev_name);
 	else
-		a_ev_name = kasprintf(GFP_KERNEL, "%.*s%s__%d",
-				(int)event_name_len, ev_name, ev_suffix, nonce);
+		a_ev_name = kasprintf(GFP_KERNEL, "%.*s__%d",
+				(int)event_name_len, ev_name, nonce);
 
 	if (!a_ev_name)
 		goto out_val;
@@ -452,45 +493,14 @@ event_to_long_desc_attr(struct hv_24x7_event_data *event, int nonce)
 	return device_str_attr_create(name, nl, nonce, desc, dl);
 }
 
-static ssize_t event_data_to_attrs(unsigned ix, struct attribute **attrs,
+static int event_data_to_attrs(unsigned ix, struct attribute **attrs,
 				   struct hv_24x7_event_data *event, int nonce)
 {
-	unsigned i;
-
-	switch (event->domain) {
-	case HV_PERF_DOMAIN_PHYS_CHIP:
-		*attrs = event_to_attr(ix, event, event->domain, nonce);
-		return 1;
-	case HV_PERF_DOMAIN_PHYS_CORE:
-		for (i = 0; i < ARRAY_SIZE(core_domains); i++) {
-			attrs[i] = event_to_attr(ix, event, core_domains[i],
-						nonce);
-			if (!attrs[i]) {
-				pr_warn("catalog event %u: individual attr %u "
-					"creation failure\n", ix, i);
-				for (; i; i--)
-					device_str_attr_destroy(attrs[i - 1]);
-				return -1;
-			}
-		}
-		return i;
-	default:
-		pr_warn("catalog event %u: domain %u is not allowed in the "
-				"catalog\n", ix, event->domain);
+	*attrs = event_to_attr(ix, event, event->domain, nonce);
+	if (!*attrs)
 		return -1;
-	}
-}
 
-static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
-{
-	switch (event->domain) {
-	case HV_PERF_DOMAIN_PHYS_CHIP:
-		return 1;
-	case HV_PERF_DOMAIN_PHYS_CORE:
-		return ARRAY_SIZE(core_domains);
-	default:
-		return 0;
-	}
+	return 0;
 }
 
 static unsigned long vmalloc_to_phys(void *v)
@@ -726,9 +736,8 @@ static int create_events_from_catalog(struct attribute ***events_,
 		goto e_free;
 	}
 
-	if (SIZE_MAX / MAX_EVENTS_PER_EVENT_DATA - 1 < event_entry_count) {
-		pr_err("event_entry_count %zu is invalid\n",
-				event_entry_count);
+	if (SIZE_MAX - 1 < event_entry_count) {
+		pr_err("event_entry_count %zu is invalid\n", event_entry_count);
 		ret = -EIO;
 		goto e_free;
 	}
@@ -801,7 +810,7 @@ static int create_events_from_catalog(struct attribute ***events_,
 			continue;
 		}
 
-		attr_max += event_to_attr_ct(event);
+		attr_max++;
 	}
 
 	event_idx_last = event_idx;
@@ -851,12 +860,12 @@ static int create_events_from_catalog(struct attribute ***events_,
 		nonce = event_uniq_add(&ev_uniq, name, nl, event->domain);
 		ct    = event_data_to_attrs(event_idx, events + event_attr_ct,
 					    event, nonce);
-		if (ct <= 0) {
+		if (ct < 0) {
 			pr_warn("event %zu (%.*s) creation failure, skipping\n",
 				event_idx, nl, name);
 			junk_events++;
 		} else {
-			event_attr_ct += ct;
+			event_attr_ct++;
 			event_descs[desc_ct] = event_to_desc_attr(event, nonce);
 			if (event_descs[desc_ct])
 				desc_ct++;
@@ -961,6 +970,27 @@ e_free:
 	return ret;
 }
 
+static ssize_t domains_show(struct device *dev, struct device_attribute *attr,
+			    char *page)
+{
+	int d, n, count = 0;
+	const char *str;
+
+	for (d = 0; d < HV_PERF_DOMAIN_MAX; d++) {
+		str = domain_name(d);
+		if (!str)
+			continue;
+
+		n = sprintf(page, "%d: %s\n", d, str);
+		if (n < 0)
+			break;
+
+		count += n;
+		page += n;
+	}
+	return count;
+}
+
 #define PAGE_0_ATTR(_name, _fmt, _expr)				\
 static ssize_t _name##_show(struct device *dev,			\
 			    struct device_attribute *dev_attr,	\
@@ -989,6 +1019,7 @@ PAGE_0_ATTR(catalog_version, "%lld\n",
 PAGE_0_ATTR(catalog_len, "%lld\n",
 		(unsigned long long)be32_to_cpu(page_0->length) * 4096);
 static BIN_ATTR_RO(catalog, 0/* real length varies */);
+static DEVICE_ATTR_RO(domains);
 
 static struct bin_attribute *if_bin_attrs[] = {
 	&bin_attr_catalog,
@@ -998,6 +1029,7 @@ static struct bin_attribute *if_bin_attrs[] = {
 static struct attribute *if_attrs[] = {
 	&dev_attr_catalog_len.attr,
 	&dev_attr_catalog_version.attr,
+	&dev_attr_domains.attr,
 	NULL,
 };
 
@@ -1089,10 +1121,16 @@ static int add_event_to_24x7_request(struct perf_event *event,
 		return -EINVAL;
 	}
 
-	if (is_physical_domain(event_get_domain(event)))
+	switch (event_get_domain(event)) {
+	case HV_PERF_DOMAIN_PHYS_CHIP:
+		idx = event_get_chip(event);
+		break;
+	case HV_PERF_DOMAIN_PHYS_CORE:
 		idx = event_get_core(event);
-	else
+		break;
+	default:
 		idx = event_get_vcpu(event);
+	}
 
 	i = request_buffer->num_requests++;
 	req = &request_buffer->requests[i];
@@ -1208,11 +1246,12 @@ static int h_24x7_event_init(struct perf_event *event)
 		return -EACCES;
 	}
 
-	/* see if the event complains */
+	/* Get the initial value of the counter for this event */
 	if (single_24x7_request(event, &ct)) {
 		pr_devel("test hcall failed\n");
 		return -EIO;
 	}
+	(void)local64_xchg(&event->hw.prev_count, ct);
 
 	return 0;
 }
@@ -1275,6 +1314,16 @@ static void h_24x7_event_read(struct perf_event *event)
 			h24x7hw = &get_cpu_var(hv_24x7_hw);
 			h24x7hw->events[i] = event;
 			put_cpu_var(h24x7hw);
+			/*
+			 * Clear the event count so we can compute the _change_
+			 * in the 24x7 raw counter value at the end of the txn.
+			 *
+			 * Note that we could alternatively read the 24x7 value
+			 * now and save its value in event->hw.prev_count. But
+			 * that would require issuing a hcall, which would then
+			 * defeat the purpose of using the txn interface.
+			 */
+			local64_set(&event->count, 0);
 		}
 
 		put_cpu_var(hv_24x7_reqb);
diff --git a/arch/powerpc/perf/hv-24x7.h b/arch/powerpc/perf/hv-24x7.h
index 0f9fa21a29f2..791455e7f5cf 100644
--- a/arch/powerpc/perf/hv-24x7.h
+++ b/arch/powerpc/perf/hv-24x7.h
@@ -7,6 +7,7 @@ enum hv_perf_domains {
 #define DOMAIN(n, v, x, c) HV_PERF_DOMAIN_##n = v,
 #include "hv-24x7-domains.h"
 #undef DOMAIN
+	HV_PERF_DOMAIN_MAX,
 };
 
 struct hv_24x7_request {
@@ -80,7 +81,7 @@ struct hv_24x7_result {
 	__u8 results_complete;
 	__be16 num_elements_returned;
 
-	/* This is a copy of @data_size from the coresponding hv_24x7_request */
+	/* This is a copy of @data_size from the corresponding hv_24x7_request */
 	__be16 result_element_data_size;
 	__u8 reserved[0x2];
 
diff --git a/arch/powerpc/perf/hv-gpci.c b/arch/powerpc/perf/hv-gpci.c
index 856fe6e03c2a..7aa37236bb70 100644
--- a/arch/powerpc/perf/hv-gpci.c
+++ b/arch/powerpc/perf/hv-gpci.c
@@ -127,8 +127,16 @@ static const struct attribute_group *attr_groups[] = {
 	NULL,
 };
 
-#define GPCI_MAX_DATA_BYTES \
-	(1024 - sizeof(struct hv_get_perf_counter_info_params))
+#define HGPCI_REQ_BUFFER_SIZE	4096
+#define HGPCI_MAX_DATA_BYTES \
+	(HGPCI_REQ_BUFFER_SIZE - sizeof(struct hv_get_perf_counter_info_params))
+
+DEFINE_PER_CPU(char, hv_gpci_reqb[HGPCI_REQ_BUFFER_SIZE]) __aligned(sizeof(uint64_t));
+
+struct hv_gpci_request_buffer {
+	struct hv_get_perf_counter_info_params params;
+	uint8_t bytes[HGPCI_MAX_DATA_BYTES];
+} __packed;
 
 static unsigned long single_gpci_request(u32 req, u32 starting_index,
 		u16 secondary_index, u8 version_in, u32 offset, u8 length,
@@ -137,24 +145,21 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index,
 	unsigned long ret;
 	size_t i;
 	u64 count;
+	struct hv_gpci_request_buffer *arg;
+
+	arg = (void *)get_cpu_var(hv_gpci_reqb);
+	memset(arg, 0, HGPCI_REQ_BUFFER_SIZE);
 
-	struct {
-		struct hv_get_perf_counter_info_params params;
-		uint8_t bytes[GPCI_MAX_DATA_BYTES];
-	} __packed __aligned(sizeof(uint64_t)) arg = {
-		.params = {
-			.counter_request = cpu_to_be32(req),
-			.starting_index = cpu_to_be32(starting_index),
-			.secondary_index = cpu_to_be16(secondary_index),
-			.counter_info_version_in = version_in,
-		}
-	};
+	arg->params.counter_request = cpu_to_be32(req);
+	arg->params.starting_index = cpu_to_be32(starting_index);
+	arg->params.secondary_index = cpu_to_be16(secondary_index);
+	arg->params.counter_info_version_in = version_in;
 
 	ret = plpar_hcall_norets(H_GET_PERF_COUNTER_INFO,
-			virt_to_phys(&arg), sizeof(arg));
+			virt_to_phys(arg), HGPCI_REQ_BUFFER_SIZE);
 	if (ret) {
 		pr_devel("hcall failed: 0x%lx\n", ret);
-		return ret;
+		goto out;
 	}
 
 	/*
@@ -163,9 +168,11 @@ static unsigned long single_gpci_request(u32 req, u32 starting_index,
 	 */
 	count = 0;
 	for (i = offset; i < offset + length; i++)
-		count |= arg.bytes[i] << (i - offset);
+		count |= arg->bytes[i] << (i - offset);
 
 	*value = count;
+out:
+	put_cpu_var(hv_gpci_reqb);
 	return ret;
 }
 
@@ -245,10 +252,10 @@ static int h_gpci_event_init(struct perf_event *event)
 	}
 
 	/* last byte within the buffer? */
-	if ((event_get_offset(event) + length) > GPCI_MAX_DATA_BYTES) {
+	if ((event_get_offset(event) + length) > HGPCI_MAX_DATA_BYTES) {
 		pr_devel("request outside of buffer: %zu > %zu\n",
 				(size_t)event_get_offset(event) + length,
-				GPCI_MAX_DATA_BYTES);
+				HGPCI_MAX_DATA_BYTES);
 		return -EINVAL;
 	}
 
diff --git a/arch/powerpc/perf/power7-pmu.c b/arch/powerpc/perf/power7-pmu.c
index 5b62f2389290..a383c23a9070 100644
--- a/arch/powerpc/perf/power7-pmu.c
+++ b/arch/powerpc/perf/power7-pmu.c
@@ -54,7 +54,7 @@
  * Power7 event codes.
  */
 #define EVENT(_name, _code) \
-	PME_##_name = _code,
+	_name = _code,
 
 enum {
 #include "power7-events-list.h"
@@ -318,14 +318,14 @@ static void power7_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 }
 
 static int power7_generic_events[] = {
-	[PERF_COUNT_HW_CPU_CYCLES] =			PME_PM_CYC,
-	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =	PME_PM_GCT_NOSLOT_CYC,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =	PME_PM_CMPLU_STALL,
-	[PERF_COUNT_HW_INSTRUCTIONS] =			PME_PM_INST_CMPL,
-	[PERF_COUNT_HW_CACHE_REFERENCES] =		PME_PM_LD_REF_L1,
-	[PERF_COUNT_HW_CACHE_MISSES] =			PME_PM_LD_MISS_L1,
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =		PME_PM_BRU_FIN,
-	[PERF_COUNT_HW_BRANCH_MISSES] =			PME_PM_BR_MPRED,
+	[PERF_COUNT_HW_CPU_CYCLES] =			PM_CYC,
+	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] =	PM_GCT_NOSLOT_CYC,
+	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] =	PM_CMPLU_STALL,
+	[PERF_COUNT_HW_INSTRUCTIONS] =			PM_INST_CMPL,
+	[PERF_COUNT_HW_CACHE_REFERENCES] =		PM_LD_REF_L1,
+	[PERF_COUNT_HW_CACHE_MISSES] =			PM_LD_MISS_L1,
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =		PM_BRU_FIN,
+	[PERF_COUNT_HW_BRANCH_MISSES] =			PM_BR_MPRED,
 };
 
 #define C(x)	PERF_COUNT_HW_CACHE_##x
diff --git a/arch/powerpc/perf/power8-events-list.h b/arch/powerpc/perf/power8-events-list.h
new file mode 100644
index 000000000000..741b77edd03e
--- /dev/null
+++ b/arch/powerpc/perf/power8-events-list.h
@@ -0,0 +1,51 @@
+/*
+ * Performance counter support for POWER8 processors.
+ *
+ * Copyright 2014 Sukadev Bhattiprolu, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+/*
+ * Power8 event codes.
+ */
+EVENT(PM_CYC,					0x0001e)
+EVENT(PM_GCT_NOSLOT_CYC,			0x100f8)
+EVENT(PM_CMPLU_STALL,				0x4000a)
+EVENT(PM_INST_CMPL,				0x00002)
+EVENT(PM_BRU_FIN,				0x10068)
+EVENT(PM_BR_MPRED_CMPL,				0x400f6)
+
+/* All L1 D cache load references counted at finish, gated by reject */
+EVENT(PM_LD_REF_L1,				0x100ee)
+/* Load Missed L1 */
+EVENT(PM_LD_MISS_L1,				0x3e054)
+/* Store Missed L1 */
+EVENT(PM_ST_MISS_L1,				0x300f0)
+/* L1 cache data prefetches */
+EVENT(PM_L1_PREF,				0x0d8b8)
+/* Instruction fetches from L1 */
+EVENT(PM_INST_FROM_L1,				0x04080)
+/* Demand iCache Miss */
+EVENT(PM_L1_ICACHE_MISS,			0x200fd)
+/* Instruction Demand sectors wriittent into IL1 */
+EVENT(PM_L1_DEMAND_WRITE,			0x0408c)
+/* Instruction prefetch written into IL1 */
+EVENT(PM_IC_PREF_WRITE,				0x0408e)
+/* The data cache was reloaded from local core's L3 due to a demand load */
+EVENT(PM_DATA_FROM_L3,				0x4c042)
+/* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
+EVENT(PM_DATA_FROM_L3MISS,			0x300fe)
+/* All successful D-side store dispatches for this thread */
+EVENT(PM_L2_ST,					0x17080)
+/* All successful D-side store dispatches for this thread that were L2 Miss */
+EVENT(PM_L2_ST_MISS,				0x17082)
+/* Total HW L3 prefetches(Load+store) */
+EVENT(PM_L3_PREF_ALL,				0x4e052)
+/* Data PTEG reload */
+EVENT(PM_DTLB_MISS,				0x300fc)
+/* ITLB Reloaded */
+EVENT(PM_ITLB_MISS,				0x400fc)
diff --git a/arch/powerpc/perf/power8-pmu.c b/arch/powerpc/perf/power8-pmu.c
index 9958ba8bf0d2..690d9186a855 100644
--- a/arch/powerpc/perf/power8-pmu.c
+++ b/arch/powerpc/perf/power8-pmu.c
@@ -17,48 +17,16 @@
 #include <asm/firmware.h>
 #include <asm/cputable.h>
 
-
 /*
  * Some power8 event codes.
  */
-#define PM_CYC				0x0001e
-#define PM_GCT_NOSLOT_CYC		0x100f8
-#define PM_CMPLU_STALL			0x4000a
-#define PM_INST_CMPL			0x00002
-#define PM_BRU_FIN			0x10068
-#define PM_BR_MPRED_CMPL		0x400f6
-
-/* All L1 D cache load references counted at finish, gated by reject */
-#define PM_LD_REF_L1			0x100ee
-/* Load Missed L1 */
-#define PM_LD_MISS_L1			0x3e054
-/* Store Missed L1 */
-#define PM_ST_MISS_L1			0x300f0
-/* L1 cache data prefetches */
-#define PM_L1_PREF			0x0d8b8
-/* Instruction fetches from L1 */
-#define PM_INST_FROM_L1			0x04080
-/* Demand iCache Miss */
-#define PM_L1_ICACHE_MISS		0x200fd
-/* Instruction Demand sectors wriittent into IL1 */
-#define PM_L1_DEMAND_WRITE		0x0408c
-/* Instruction prefetch written into IL1 */
-#define PM_IC_PREF_WRITE		0x0408e
-/* The data cache was reloaded from local core's L3 due to a demand load */
-#define PM_DATA_FROM_L3			0x4c042
-/* Demand LD - L3 Miss (not L2 hit and not L3 hit) */
-#define PM_DATA_FROM_L3MISS		0x300fe
-/* All successful D-side store dispatches for this thread */
-#define PM_L2_ST			0x17080
-/* All successful D-side store dispatches for this thread that were L2 Miss */
-#define PM_L2_ST_MISS			0x17082
-/* Total HW L3 prefetches(Load+store) */
-#define PM_L3_PREF_ALL			0x4e052
-/* Data PTEG reload */
-#define PM_DTLB_MISS			0x300fc
-/* ITLB Reloaded */
-#define PM_ITLB_MISS			0x400fc
+#define EVENT(_name, _code)	_name = _code,
+
+enum {
+#include "power8-events-list.h"
+};
 
+#undef EVENT
 
 /*
  * Raw event encoding for POWER8:
@@ -415,7 +383,7 @@ static int power8_compute_mmcr(u64 event[], int n_ev,
 			pmc_inuse |= 1 << pmc;
 	}
 
-	/* In continous sampling mode, update SDAR on TLB miss */
+	/* In continuous sampling mode, update SDAR on TLB miss */
 	mmcra = MMCRA_SDAR_MODE_TLB;
 	mmcr1 = mmcr2 = 0;
 
@@ -604,6 +572,71 @@ static void power8_disable_pmc(unsigned int pmc, unsigned long mmcr[])
 		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SHIFT(pmc + 1));
 }
 
+GENERIC_EVENT_ATTR(cpu-cycles,			PM_CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-frontend,	PM_GCT_NOSLOT_CYC);
+GENERIC_EVENT_ATTR(stalled-cycles-backend,	PM_CMPLU_STALL);
+GENERIC_EVENT_ATTR(instructions,		PM_INST_CMPL);
+GENERIC_EVENT_ATTR(branch-instructions,		PM_BRU_FIN);
+GENERIC_EVENT_ATTR(branch-misses,		PM_BR_MPRED_CMPL);
+GENERIC_EVENT_ATTR(cache-references,		PM_LD_REF_L1);
+GENERIC_EVENT_ATTR(cache-misses,		PM_LD_MISS_L1);
+
+CACHE_EVENT_ATTR(L1-dcache-load-misses,		PM_LD_MISS_L1);
+CACHE_EVENT_ATTR(L1-dcache-loads,		PM_LD_REF_L1);
+
+CACHE_EVENT_ATTR(L1-dcache-prefetches,		PM_L1_PREF);
+CACHE_EVENT_ATTR(L1-dcache-store-misses,	PM_ST_MISS_L1);
+CACHE_EVENT_ATTR(L1-icache-load-misses,		PM_L1_ICACHE_MISS);
+CACHE_EVENT_ATTR(L1-icache-loads,		PM_INST_FROM_L1);
+CACHE_EVENT_ATTR(L1-icache-prefetches,		PM_IC_PREF_WRITE);
+
+CACHE_EVENT_ATTR(LLC-load-misses,		PM_DATA_FROM_L3MISS);
+CACHE_EVENT_ATTR(LLC-loads,			PM_DATA_FROM_L3);
+CACHE_EVENT_ATTR(LLC-prefetches,		PM_L3_PREF_ALL);
+CACHE_EVENT_ATTR(LLC-store-misses,		PM_L2_ST_MISS);
+CACHE_EVENT_ATTR(LLC-stores,			PM_L2_ST);
+
+CACHE_EVENT_ATTR(branch-load-misses,		PM_BR_MPRED_CMPL);
+CACHE_EVENT_ATTR(branch-loads,			PM_BRU_FIN);
+CACHE_EVENT_ATTR(dTLB-load-misses,		PM_DTLB_MISS);
+CACHE_EVENT_ATTR(iTLB-load-misses,		PM_ITLB_MISS);
+
+static struct attribute *power8_events_attr[] = {
+	GENERIC_EVENT_PTR(PM_CYC),
+	GENERIC_EVENT_PTR(PM_GCT_NOSLOT_CYC),
+	GENERIC_EVENT_PTR(PM_CMPLU_STALL),
+	GENERIC_EVENT_PTR(PM_INST_CMPL),
+	GENERIC_EVENT_PTR(PM_BRU_FIN),
+	GENERIC_EVENT_PTR(PM_BR_MPRED_CMPL),
+	GENERIC_EVENT_PTR(PM_LD_REF_L1),
+	GENERIC_EVENT_PTR(PM_LD_MISS_L1),
+
+	CACHE_EVENT_PTR(PM_LD_MISS_L1),
+	CACHE_EVENT_PTR(PM_LD_REF_L1),
+	CACHE_EVENT_PTR(PM_L1_PREF),
+	CACHE_EVENT_PTR(PM_ST_MISS_L1),
+	CACHE_EVENT_PTR(PM_L1_ICACHE_MISS),
+	CACHE_EVENT_PTR(PM_INST_FROM_L1),
+	CACHE_EVENT_PTR(PM_IC_PREF_WRITE),
+	CACHE_EVENT_PTR(PM_DATA_FROM_L3MISS),
+	CACHE_EVENT_PTR(PM_DATA_FROM_L3),
+	CACHE_EVENT_PTR(PM_L3_PREF_ALL),
+	CACHE_EVENT_PTR(PM_L2_ST_MISS),
+	CACHE_EVENT_PTR(PM_L2_ST),
+
+	CACHE_EVENT_PTR(PM_BR_MPRED_CMPL),
+	CACHE_EVENT_PTR(PM_BRU_FIN),
+
+	CACHE_EVENT_PTR(PM_DTLB_MISS),
+	CACHE_EVENT_PTR(PM_ITLB_MISS),
+	NULL
+};
+
+static struct attribute_group power8_pmu_events_group = {
+	.name = "events",
+	.attrs = power8_events_attr,
+};
+
 PMU_FORMAT_ATTR(event,		"config:0-49");
 PMU_FORMAT_ATTR(pmcxsel,	"config:0-7");
 PMU_FORMAT_ATTR(mark,		"config:8");
@@ -640,6 +673,7 @@ struct attribute_group power8_pmu_format_group = {
 
 static const struct attribute_group *power8_pmu_attr_groups[] = {
 	&power8_pmu_format_group,
+	&power8_pmu_events_group,
 	NULL,
 };
 
diff --git a/arch/powerpc/platforms/52xx/mpc52xx_pci.c b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
index 6eb3b2abae90..00282c2b0cae 100644
--- a/arch/powerpc/platforms/52xx/mpc52xx_pci.c
+++ b/arch/powerpc/platforms/52xx/mpc52xx_pci.c
@@ -319,7 +319,7 @@ mpc52xx_pci_setup(struct pci_controller *hose,
 
 	tmp = in_be32(&pci_regs->gscr);
 #if 0
-	/* Reset the exteral bus ( internal PCI controller is NOT resetted ) */
+	/* Reset the exteral bus ( internal PCI controller is NOT reset ) */
 	/* Not necessary and can be a bad thing if for example the bootloader
 	   is displaying a splash screen or ... Just left here for
 	   documentation purpose if anyone need it */
diff --git a/arch/powerpc/platforms/85xx/mpc85xx_cds.c b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
index 5ac70de3e48a..d7e87ff912d7 100644
--- a/arch/powerpc/platforms/85xx/mpc85xx_cds.c
+++ b/arch/powerpc/platforms/85xx/mpc85xx_cds.c
@@ -99,7 +99,7 @@ static void mpc85xx_cds_restart(char *cmd)
 		pci_read_config_byte(dev, 0x47, &tmp);
 
 		/*
-		 *  At this point, the harware reset should have triggered.
+		 *  At this point, the hardware reset should have triggered.
 		 *  However, if it doesn't work for some mysterious reason,
 		 *  just fall through to the default reset below.
 		 */
diff --git a/arch/powerpc/platforms/powermac/cache.S b/arch/powerpc/platforms/powermac/cache.S
index 6be1a4af3359..cc5347eb1662 100644
--- a/arch/powerpc/platforms/powermac/cache.S
+++ b/arch/powerpc/platforms/powermac/cache.S
@@ -23,7 +23,7 @@
  * when going to sleep, when doing a PMU based cpufreq transition,
  * or when "offlining" a CPU on SMP machines. This code is over
  * paranoid, but I've had enough issues with various CPU revs and
- * bugs that I decided it was worth beeing over cautious
+ * bugs that I decided it was worth being over cautious
  */
 
 _GLOBAL(flush_disable_caches)
diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c
index 4882bfd90e27..1e02328c3f2d 100644
--- a/arch/powerpc/platforms/powermac/feature.c
+++ b/arch/powerpc/platforms/powermac/feature.c
@@ -198,7 +198,7 @@ static long ohare_htw_scc_enable(struct device_node *node, long param,
 			if (htw) {
 				/* Side effect: this will also power up the
 				 * modem, but it's too messy to figure out on which
-				 * ports this controls the tranceiver and on which
+				 * ports this controls the transceiver and on which
 				 * it controls the modem
 				 */
 				if (trans)
@@ -463,7 +463,7 @@ static long heathrow_sound_enable(struct device_node *node, long param,
 	unsigned long		flags;
 
 	/* B&W G3 and Yikes don't support that properly (the
-	 * sound appear to never come back after beeing shut down).
+	 * sound appear to never come back after being shut down).
 	 */
 	if (pmac_mb.model_id == PMAC_TYPE_YOSEMITE ||
 	    pmac_mb.model_id == PMAC_TYPE_YIKES)
@@ -2770,7 +2770,7 @@ set_initial_features(void)
 	 * but I'm not too sure it was audited for side-effects on other
 	 * ohare based machines...
 	 * Since I still have difficulties figuring the right way to
-	 * differenciate them all and since that hack was there for a long
+	 * differentiate them all and since that hack was there for a long
 	 * time, I'll keep it around
 	 */
 	if (macio_chips[0].type == macio_ohare) {
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index f1516b5ecec9..cd9711e72df6 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -5,7 +5,7 @@ obj-y			+= opal-msglog.o opal-hmi.o opal-power.o opal-irqchip.o
 obj-y			+= opal-kmsg.o
 
 obj-$(CONFIG_SMP)	+= smp.o subcore.o subcore-asm.o
-obj-$(CONFIG_PCI)	+= pci.o pci-p5ioc2.o pci-ioda.o npu-dma.o
+obj-$(CONFIG_PCI)	+= pci.o pci-ioda.o npu-dma.o
 obj-$(CONFIG_EEH)	+= eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM)	+= opal-xscom.o
 obj-$(CONFIG_MEMORY_FAILURE)	+= opal-memory-errors.o
diff --git a/arch/powerpc/platforms/powernv/eeh-powernv.c b/arch/powerpc/platforms/powernv/eeh-powernv.c
index 5f152b95ca0c..950b3e539057 100644
--- a/arch/powerpc/platforms/powernv/eeh-powernv.c
+++ b/arch/powerpc/platforms/powernv/eeh-powernv.c
@@ -167,42 +167,26 @@ static int pnv_eeh_dbgfs_get(void *data, int offset, u64 *val)
 	return 0;
 }
 
-static int pnv_eeh_outb_dbgfs_set(void *data, u64 val)
-{
-	return pnv_eeh_dbgfs_set(data, 0xD10, val);
-}
-
-static int pnv_eeh_outb_dbgfs_get(void *data, u64 *val)
-{
-	return pnv_eeh_dbgfs_get(data, 0xD10, val);
-}
-
-static int pnv_eeh_inbA_dbgfs_set(void *data, u64 val)
-{
-	return pnv_eeh_dbgfs_set(data, 0xD90, val);
-}
-
-static int pnv_eeh_inbA_dbgfs_get(void *data, u64 *val)
-{
-	return pnv_eeh_dbgfs_get(data, 0xD90, val);
-}
-
-static int pnv_eeh_inbB_dbgfs_set(void *data, u64 val)
-{
-	return pnv_eeh_dbgfs_set(data, 0xE10, val);
-}
+#define PNV_EEH_DBGFS_ENTRY(name, reg)				\
+static int pnv_eeh_dbgfs_set_##name(void *data, u64 val)	\
+{								\
+	return pnv_eeh_dbgfs_set(data, reg, val);		\
+}								\
+								\
+static int pnv_eeh_dbgfs_get_##name(void *data, u64 *val)	\
+{								\
+	return pnv_eeh_dbgfs_get(data, reg, val);		\
+}								\
+								\
+DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_dbgfs_ops_##name,		\
+			pnv_eeh_dbgfs_get_##name,		\
+                        pnv_eeh_dbgfs_set_##name,		\
+			"0x%llx\n")
+
+PNV_EEH_DBGFS_ENTRY(outb, 0xD10);
+PNV_EEH_DBGFS_ENTRY(inbA, 0xD90);
+PNV_EEH_DBGFS_ENTRY(inbB, 0xE10);
 
-static int pnv_eeh_inbB_dbgfs_get(void *data, u64 *val)
-{
-	return pnv_eeh_dbgfs_get(data, 0xE10, val);
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_outb_dbgfs_ops, pnv_eeh_outb_dbgfs_get,
-			pnv_eeh_outb_dbgfs_set, "0x%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbA_dbgfs_ops, pnv_eeh_inbA_dbgfs_get,
-			pnv_eeh_inbA_dbgfs_set, "0x%llx\n");
-DEFINE_SIMPLE_ATTRIBUTE(pnv_eeh_inbB_dbgfs_ops, pnv_eeh_inbB_dbgfs_get,
-			pnv_eeh_inbB_dbgfs_set, "0x%llx\n");
 #endif /* CONFIG_DEBUG_FS */
 
 /**
@@ -268,13 +252,13 @@ static int pnv_eeh_post_init(void)
 
 		debugfs_create_file("err_injct_outbound", 0600,
 				    phb->dbgfs, hose,
-				    &pnv_eeh_outb_dbgfs_ops);
+				    &pnv_eeh_dbgfs_ops_outb);
 		debugfs_create_file("err_injct_inboundA", 0600,
 				    phb->dbgfs, hose,
-				    &pnv_eeh_inbA_dbgfs_ops);
+				    &pnv_eeh_dbgfs_ops_inbA);
 		debugfs_create_file("err_injct_inboundB", 0600,
 				    phb->dbgfs, hose,
-				    &pnv_eeh_inbB_dbgfs_ops);
+				    &pnv_eeh_dbgfs_ops_inbB);
 #endif /* CONFIG_DEBUG_FS */
 	}
 
@@ -387,6 +371,7 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	edev->mode	&= 0xFFFFFF00;
 	edev->pcix_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_PCIX);
 	edev->pcie_cap = pnv_eeh_find_cap(pdn, PCI_CAP_ID_EXP);
+	edev->af_cap   = pnv_eeh_find_cap(pdn, PCI_CAP_ID_AF);
 	edev->aer_cap  = pnv_eeh_find_ecap(pdn, PCI_EXT_CAP_ID_ERR);
 	if ((edev->class_code >> 8) == PCI_CLASS_BRIDGE_PCI) {
 		edev->mode |= EEH_DEV_BRIDGE;
@@ -444,9 +429,12 @@ static void *pnv_eeh_probe(struct pci_dn *pdn, void *data)
 	 * PCI devices of the PE are expected to be removed prior
 	 * to PE reset.
 	 */
-	if (!edev->pe->bus)
+	if (!(edev->pe->state & EEH_PE_PRI_BUS)) {
 		edev->pe->bus = pci_find_bus(hose->global_number,
 					     pdn->busno);
+		if (edev->pe->bus)
+			edev->pe->state |= EEH_PE_PRI_BUS;
+	}
 
 	/*
 	 * Enable EEH explicitly so that we will do EEH check
@@ -892,6 +880,120 @@ void pnv_pci_reset_secondary_bus(struct pci_dev *dev)
 	}
 }
 
+static void pnv_eeh_wait_for_pending(struct pci_dn *pdn, const char *type,
+				     int pos, u16 mask)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	int i, status = 0;
+
+	/* Wait for Transaction Pending bit to be cleared */
+	for (i = 0; i < 4; i++) {
+		eeh_ops->read_config(pdn, pos, 2, &status);
+		if (!(status & mask))
+			return;
+
+		msleep((1 << i) * 100);
+	}
+
+	pr_warn("%s: Pending transaction while issuing %sFLR to %04x:%02x:%02x.%01x\n",
+		__func__, type,
+		edev->phb->global_number, pdn->busno,
+		PCI_SLOT(pdn->devfn), PCI_FUNC(pdn->devfn));
+}
+
+static int pnv_eeh_do_flr(struct pci_dn *pdn, int option)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 reg = 0;
+
+	if (WARN_ON(!edev->pcie_cap))
+		return -ENOTTY;
+
+	eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP, 4, &reg);
+	if (!(reg & PCI_EXP_DEVCAP_FLR))
+		return -ENOTTY;
+
+	switch (option) {
+	case EEH_RESET_HOT:
+	case EEH_RESET_FUNDAMENTAL:
+		pnv_eeh_wait_for_pending(pdn, "",
+					 edev->pcie_cap + PCI_EXP_DEVSTA,
+					 PCI_EXP_DEVSTA_TRPND);
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     4, &reg);
+		reg |= PCI_EXP_DEVCTL_BCR_FLR;
+		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      4, reg);
+		msleep(EEH_PE_RST_HOLD_TIME);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     4, &reg);
+		reg &= ~PCI_EXP_DEVCTL_BCR_FLR;
+		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      4, reg);
+		msleep(EEH_PE_RST_SETTLE_TIME);
+		break;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_do_af_flr(struct pci_dn *pdn, int option)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 cap = 0;
+
+	if (WARN_ON(!edev->af_cap))
+		return -ENOTTY;
+
+	eeh_ops->read_config(pdn, edev->af_cap + PCI_AF_CAP, 1, &cap);
+	if (!(cap & PCI_AF_CAP_TP) || !(cap & PCI_AF_CAP_FLR))
+		return -ENOTTY;
+
+	switch (option) {
+	case EEH_RESET_HOT:
+	case EEH_RESET_FUNDAMENTAL:
+		/*
+		 * Wait for Transaction Pending bit to clear. A word-aligned
+		 * test is used, so we use the conrol offset rather than status
+		 * and shift the test bit to match.
+		 */
+		pnv_eeh_wait_for_pending(pdn, "AF",
+					 edev->af_cap + PCI_AF_CTRL,
+					 PCI_AF_STATUS_TP << 8);
+		eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL,
+				      1, PCI_AF_CTRL_FLR);
+		msleep(EEH_PE_RST_HOLD_TIME);
+		break;
+	case EEH_RESET_DEACTIVATE:
+		eeh_ops->write_config(pdn, edev->af_cap + PCI_AF_CTRL, 1, 0);
+		msleep(EEH_PE_RST_SETTLE_TIME);
+		break;
+	}
+
+	return 0;
+}
+
+static int pnv_eeh_reset_vf_pe(struct eeh_pe *pe, int option)
+{
+	struct eeh_dev *edev;
+	struct pci_dn *pdn;
+	int ret;
+
+	/* The VF PE should have only one child device */
+	edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, list);
+	pdn = eeh_dev_to_pdn(edev);
+	if (!pdn)
+		return -ENXIO;
+
+	ret = pnv_eeh_do_flr(pdn, option);
+	if (!ret)
+		return ret;
+
+	return pnv_eeh_do_af_flr(pdn, option);
+}
+
 /**
  * pnv_eeh_reset - Reset the specified PE
  * @pe: EEH PE
@@ -953,7 +1055,9 @@ static int pnv_eeh_reset(struct eeh_pe *pe, int option)
 		}
 
 		bus = eeh_pe_bus_get(pe);
-		if (pci_is_root_bus(bus) ||
+		if (pe->type & EEH_PE_VF)
+			ret = pnv_eeh_reset_vf_pe(pe, option);
+		else if (pci_is_root_bus(bus) ||
 			pci_is_root_bus(bus->parent))
 			ret = pnv_eeh_root_reset(hose, option);
 		else
@@ -1092,6 +1196,14 @@ static inline bool pnv_eeh_cfg_blocked(struct pci_dn *pdn)
 	if (!edev || !edev->pe)
 		return false;
 
+	/*
+	 * We will issue FLR or AF FLR to all VFs, which are contained
+	 * in VF PE. It relies on the EEH PCI config accessors. So we
+	 * can't block them during the window.
+	 */
+	if (edev->physfn && (edev->pe->state & EEH_PE_RESET))
+		return false;
+
 	if (edev->pe->state & EEH_PE_CFG_BLOCKED)
 		return true;
 
@@ -1476,6 +1588,65 @@ static int pnv_eeh_next_error(struct eeh_pe **pe)
 	return ret;
 }
 
+static int pnv_eeh_restore_vf_config(struct pci_dn *pdn)
+{
+	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
+	u32 devctl, cmd, cap2, aer_capctl;
+	int old_mps;
+
+	if (edev->pcie_cap) {
+		/* Restore MPS */
+		old_mps = (ffs(pdn->mps) - 8) << 5;
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     2, &devctl);
+		devctl &= ~PCI_EXP_DEVCTL_PAYLOAD;
+		devctl |= old_mps;
+		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      2, devctl);
+
+		/* Disable Completion Timeout */
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCAP2,
+				     4, &cap2);
+		if (cap2 & 0x10) {
+			eeh_ops->read_config(pdn,
+					     edev->pcie_cap + PCI_EXP_DEVCTL2,
+					     4, &cap2);
+			cap2 |= 0x10;
+			eeh_ops->write_config(pdn,
+					      edev->pcie_cap + PCI_EXP_DEVCTL2,
+					      4, cap2);
+		}
+	}
+
+	/* Enable SERR and parity checking */
+	eeh_ops->read_config(pdn, PCI_COMMAND, 2, &cmd);
+	cmd |= (PCI_COMMAND_PARITY | PCI_COMMAND_SERR);
+	eeh_ops->write_config(pdn, PCI_COMMAND, 2, cmd);
+
+	/* Enable report various errors */
+	if (edev->pcie_cap) {
+		eeh_ops->read_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				     2, &devctl);
+		devctl &= ~PCI_EXP_DEVCTL_CERE;
+		devctl |= (PCI_EXP_DEVCTL_NFERE |
+			   PCI_EXP_DEVCTL_FERE |
+			   PCI_EXP_DEVCTL_URRE);
+		eeh_ops->write_config(pdn, edev->pcie_cap + PCI_EXP_DEVCTL,
+				      2, devctl);
+	}
+
+	/* Enable ECRC generation and check */
+	if (edev->pcie_cap && edev->aer_cap) {
+		eeh_ops->read_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+				     4, &aer_capctl);
+		aer_capctl |= (PCI_ERR_CAP_ECRC_GENE | PCI_ERR_CAP_ECRC_CHKE);
+		eeh_ops->write_config(pdn, edev->aer_cap + PCI_ERR_CAP,
+				      4, aer_capctl);
+	}
+
+	return 0;
+}
+
 static int pnv_eeh_restore_config(struct pci_dn *pdn)
 {
 	struct eeh_dev *edev = pdn_to_eeh_dev(pdn);
@@ -1485,9 +1656,21 @@ static int pnv_eeh_restore_config(struct pci_dn *pdn)
 	if (!edev)
 		return -EEXIST;
 
-	phb = edev->phb->private_data;
-	ret = opal_pci_reinit(phb->opal_id,
-			      OPAL_REINIT_PCI_DEV, edev->config_addr);
+	/*
+	 * We have to restore the PCI config space after reset since the
+	 * firmware can't see SRIOV VFs.
+	 *
+	 * FIXME: The MPS, error routing rules, timeout setting are worthy
+	 * to be exported by firmware in extendible way.
+	 */
+	if (edev->physfn) {
+		ret = pnv_eeh_restore_vf_config(pdn);
+	} else {
+		phb = edev->phb->private_data;
+		ret = opal_pci_reinit(phb->opal_id,
+				      OPAL_REINIT_PCI_DEV, edev->config_addr);
+	}
+
 	if (ret) {
 		pr_warn("%s: Can't reinit PCI dev 0x%x (%lld)\n",
 			__func__, edev->config_addr, ret);
@@ -1516,6 +1699,40 @@ static struct eeh_ops pnv_eeh_ops = {
 	.restore_config		= pnv_eeh_restore_config
 };
 
+void pcibios_bus_add_device(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+
+	if (!pdev->is_virtfn)
+		return;
+
+	/*
+	 * The following operations will fail if VF's sysfs files
+	 * aren't created or its resources aren't finalized.
+	 */
+	eeh_add_device_early(pdn);
+	eeh_add_device_late(pdev);
+	eeh_sysfs_add_device(pdev);
+}
+
+#ifdef CONFIG_PCI_IOV
+static void pnv_pci_fixup_vf_mps(struct pci_dev *pdev)
+{
+	struct pci_dn *pdn = pci_get_pdn(pdev);
+	int parent_mps;
+
+	if (!pdev->is_virtfn)
+		return;
+
+	/* Synchronize MPS for VF and PF */
+	parent_mps = pcie_get_mps(pdev->physfn);
+	if ((128 << pdev->pcie_mpss) >= parent_mps)
+		pcie_set_mps(pdev, parent_mps);
+	pdn->mps = pcie_get_mps(pdev);
+}
+DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pnv_pci_fixup_vf_mps);
+#endif /* CONFIG_PCI_IOV */
+
 /**
  * eeh_powernv_init - Register platform dependent EEH operations
  *
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 15bfbcd5debc..fcc8b6861b63 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -35,9 +35,9 @@ int pnv_save_sprs_for_winkle(void)
 	int rc;
 
 	/*
-	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric accross
+	 * hid0, hid1, hid4, hid5, hmeer and lpcr values are symmetric across
 	 * all cpus at boot. Get these reg values of current cpu and use the
-	 * same accross all cpus.
+	 * same across all cpus.
 	 */
 	uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1;
 	uint64_t hid0_val = mfspr(SPRN_HID0);
@@ -185,7 +185,7 @@ static ssize_t store_fastsleep_workaround_applyonce(struct device *dev,
 	 * fastsleep workaround needs to be left in 'applied' state on all
 	 * the cores. Do this by-
 	 * 1. Patching out the call to 'undo' workaround in fastsleep exit path
-	 * 2. Sending ipi to all the cores which have atleast one online thread
+	 * 2. Sending ipi to all the cores which have at least one online thread
 	 * 3. Patching out the call to 'apply' workaround in fastsleep entry
 	 * path
 	 * There is no need to send ipi to cores which have all threads
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index e85aa900f5c0..7229acd9bb3a 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -278,7 +278,7 @@ static void pnv_npu_disable_bypass(struct pnv_ioda_pe *npe)
 
 /*
  * Enable/disable bypass mode on the NPU. The NPU only supports one
- * window per link, so bypass needs to be explicity enabled or
+ * window per link, so bypass needs to be explicitly enabled or
  * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
  * active at the same time.
  */
diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c
index 44ed78af1a0d..39d6ff9e5630 100644
--- a/arch/powerpc/platforms/powernv/opal-msglog.c
+++ b/arch/powerpc/platforms/powernv/opal-msglog.c
@@ -31,26 +31,25 @@ struct memcons {
 	__be32 in_cons;
 };
 
-static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
-				struct bin_attribute *bin_attr, char *to,
-				loff_t pos, size_t count)
+static struct memcons *opal_memcons = NULL;
+
+ssize_t opal_msglog_copy(char *to, loff_t pos, size_t count)
 {
-	struct memcons *mc = bin_attr->private;
 	const char *conbuf;
 	ssize_t ret;
 	size_t first_read = 0;
 	uint32_t out_pos, avail;
 
-	if (!mc)
+	if (!opal_memcons)
 		return -ENODEV;
 
-	out_pos = be32_to_cpu(ACCESS_ONCE(mc->out_pos));
+	out_pos = be32_to_cpu(ACCESS_ONCE(opal_memcons->out_pos));
 
 	/* Now we've read out_pos, put a barrier in before reading the new
 	 * data it points to in conbuf. */
 	smp_rmb();
 
-	conbuf = phys_to_virt(be64_to_cpu(mc->obuf_phys));
+	conbuf = phys_to_virt(be64_to_cpu(opal_memcons->obuf_phys));
 
 	/* When the buffer has wrapped, read from the out_pos marker to the end
 	 * of the buffer, and then read the remaining data as in the un-wrapped
@@ -58,7 +57,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
 	if (out_pos & MEMCONS_OUT_POS_WRAP) {
 
 		out_pos &= MEMCONS_OUT_POS_MASK;
-		avail = be32_to_cpu(mc->obuf_size) - out_pos;
+		avail = be32_to_cpu(opal_memcons->obuf_size) - out_pos;
 
 		ret = memory_read_from_buffer(to, count, &pos,
 				conbuf + out_pos, avail);
@@ -76,7 +75,7 @@ static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
 	}
 
 	/* Sanity check. The firmware should not do this to us. */
-	if (out_pos > be32_to_cpu(mc->obuf_size)) {
+	if (out_pos > be32_to_cpu(opal_memcons->obuf_size)) {
 		pr_err("OPAL: memory console corruption. Aborting read.\n");
 		return -EINVAL;
 	}
@@ -91,6 +90,13 @@ out:
 	return ret;
 }
 
+static ssize_t opal_msglog_read(struct file *file, struct kobject *kobj,
+				struct bin_attribute *bin_attr, char *to,
+				loff_t pos, size_t count)
+{
+	return opal_msglog_copy(to, pos, count);
+}
+
 static struct bin_attribute opal_msglog_attr = {
 	.attr = {.name = "msglog", .mode = 0444},
 	.read = opal_msglog_read
@@ -117,7 +123,15 @@ void __init opal_msglog_init(void)
 		return;
 	}
 
-	opal_msglog_attr.private = mc;
+	opal_memcons = mc;
+}
+
+void __init opal_msglog_sysfs_init(void)
+{
+	if (!opal_memcons) {
+		pr_warn("OPAL: message log initialisation failed, not creating sysfs entry\n");
+		return;
+	}
 
 	if (sysfs_create_bin_file(opal_kobj, &opal_msglog_attr) != 0)
 		pr_warn("OPAL: sysfs file creation failed\n");
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 4e0da5af94a1..0256d0729252 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -724,6 +724,9 @@ static int __init opal_init(void)
 		of_node_put(leds);
 	}
 
+	/* Initialise OPAL message log interface */
+	opal_msglog_init();
+
 	/* Create "opal" kobject under /sys/firmware */
 	rc = opal_sysfs_init();
 	if (rc == 0) {
@@ -739,8 +742,8 @@ static int __init opal_init(void)
 		opal_platform_dump_init();
 		/* Setup system parameters interface */
 		opal_sys_param_init();
-		/* Setup message log interface. */
-		opal_msglog_init();
+		/* Setup message log sysfs interface. */
+		opal_msglog_sysfs_init();
 	}
 
 	/* Initialize platform devices: IPMI backend, PRD & flash interface */
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 573ae1994097..c5baaf3cc4e5 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -872,9 +872,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		if (!res->flags || !res->parent)
 			continue;
 
-		if (!pnv_pci_is_mem_pref_64(res->flags))
-			continue;
-
 		/*
 		 * The actual IOV BAR range is determined by the start address
 		 * and the actual size for num_vfs VFs BAR.  This check is to
@@ -903,9 +900,6 @@ static int pnv_pci_vf_resource_shift(struct pci_dev *dev, int offset)
 		if (!res->flags || !res->parent)
 			continue;
 
-		if (!pnv_pci_is_mem_pref_64(res->flags))
-			continue;
-
 		size = pci_iov_resource_size(dev, i + PCI_IOV_RESOURCES);
 		res2 = *res;
 		res->start += size * offset;
@@ -1196,29 +1190,36 @@ static void pnv_pci_ioda_setup_PEs(void)
 }
 
 #ifdef CONFIG_PCI_IOV
-static int pnv_pci_vf_release_m64(struct pci_dev *pdev)
+static int pnv_pci_vf_release_m64(struct pci_dev *pdev, u16 num_vfs)
 {
 	struct pci_bus        *bus;
 	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	int                    i, j;
+	int                    m64_bars;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
 	phb = hose->private_data;
 	pdn = pci_get_pdn(pdev);
 
+	if (pdn->m64_single_mode)
+		m64_bars = num_vfs;
+	else
+		m64_bars = 1;
+
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < M64_PER_IOV; j++) {
-			if (pdn->m64_wins[i][j] == IODA_INVALID_M64)
+		for (j = 0; j < m64_bars; j++) {
+			if (pdn->m64_map[j][i] == IODA_INVALID_M64)
 				continue;
 			opal_pci_phb_mmio_enable(phb->opal_id,
-				OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 0);
-			clear_bit(pdn->m64_wins[i][j], &phb->ioda.m64_bar_alloc);
-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+				OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 0);
+			clear_bit(pdn->m64_map[j][i], &phb->ioda.m64_bar_alloc);
+			pdn->m64_map[j][i] = IODA_INVALID_M64;
 		}
 
+	kfree(pdn->m64_map);
 	return 0;
 }
 
@@ -1235,8 +1236,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	int                    total_vfs;
 	resource_size_t        size, start;
 	int                    pe_num;
-	int                    vf_groups;
-	int                    vf_per_group;
+	int                    m64_bars;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1244,29 +1244,26 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	pdn = pci_get_pdn(pdev);
 	total_vfs = pci_sriov_get_totalvfs(pdev);
 
-	/* Initialize the m64_wins to IODA_INVALID_M64 */
-	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++)
-		for (j = 0; j < M64_PER_IOV; j++)
-			pdn->m64_wins[i][j] = IODA_INVALID_M64;
+	if (pdn->m64_single_mode)
+		m64_bars = num_vfs;
+	else
+		m64_bars = 1;
+
+	pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL);
+	if (!pdn->m64_map)
+		return -ENOMEM;
+	/* Initialize the m64_map to IODA_INVALID_M64 */
+	for (i = 0; i < m64_bars ; i++)
+		for (j = 0; j < PCI_SRIOV_NUM_BARS; j++)
+			pdn->m64_map[i][j] = IODA_INVALID_M64;
 
-	if (pdn->m64_per_iov == M64_PER_IOV) {
-		vf_groups = (num_vfs <= M64_PER_IOV) ? num_vfs: M64_PER_IOV;
-		vf_per_group = (num_vfs <= M64_PER_IOV)? 1:
-			roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-	} else {
-		vf_groups = 1;
-		vf_per_group = 1;
-	}
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || !res->parent)
 			continue;
 
-		if (!pnv_pci_is_mem_pref_64(res->flags))
-			continue;
-
-		for (j = 0; j < vf_groups; j++) {
+		for (j = 0; j < m64_bars; j++) {
 			do {
 				win = find_next_zero_bit(&phb->ioda.m64_bar_alloc,
 						phb->ioda.m64_bar_idx + 1, 0);
@@ -1275,12 +1272,11 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 					goto m64_failed;
 			} while (test_and_set_bit(win, &phb->ioda.m64_bar_alloc));
 
-			pdn->m64_wins[i][j] = win;
+			pdn->m64_map[j][i] = win;
 
-			if (pdn->m64_per_iov == M64_PER_IOV) {
+			if (pdn->m64_single_mode) {
 				size = pci_iov_resource_size(pdev,
 							PCI_IOV_RESOURCES + i);
-				size = size * vf_per_group;
 				start = res->start + size * j;
 			} else {
 				size = resource_size(res);
@@ -1288,16 +1284,16 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 			}
 
 			/* Map the M64 here */
-			if (pdn->m64_per_iov == M64_PER_IOV) {
-				pe_num = pdn->offset + j;
+			if (pdn->m64_single_mode) {
+				pe_num = pdn->pe_num_map[j];
 				rc = opal_pci_map_pe_mmio_window(phb->opal_id,
 						pe_num, OPAL_M64_WINDOW_TYPE,
-						pdn->m64_wins[i][j], 0);
+						pdn->m64_map[j][i], 0);
 			}
 
 			rc = opal_pci_set_phb_mem_window(phb->opal_id,
 						 OPAL_M64_WINDOW_TYPE,
-						 pdn->m64_wins[i][j],
+						 pdn->m64_map[j][i],
 						 start,
 						 0, /* unused */
 						 size);
@@ -1309,12 +1305,12 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 				goto m64_failed;
 			}
 
-			if (pdn->m64_per_iov == M64_PER_IOV)
+			if (pdn->m64_single_mode)
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 2);
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 2);
 			else
 				rc = opal_pci_phb_mmio_enable(phb->opal_id,
-				     OPAL_M64_WINDOW_TYPE, pdn->m64_wins[i][j], 1);
+				     OPAL_M64_WINDOW_TYPE, pdn->m64_map[j][i], 1);
 
 			if (rc != OPAL_SUCCESS) {
 				dev_err(&pdev->dev, "Failed to enable M64 window #%d: %llx\n",
@@ -1326,7 +1322,7 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs)
 	return 0;
 
 m64_failed:
-	pnv_pci_vf_release_m64(pdev);
+	pnv_pci_vf_release_m64(pdev, num_vfs);
 	return -EBUSY;
 }
 
@@ -1353,15 +1349,13 @@ static void pnv_pci_ioda2_release_dma_pe(struct pci_dev *dev, struct pnv_ioda_pe
 	iommu_free_table(tbl, of_node_full_name(dev->dev.of_node));
 }
 
-static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
+static void pnv_ioda_release_vf_PE(struct pci_dev *pdev)
 {
 	struct pci_bus        *bus;
 	struct pci_controller *hose;
 	struct pnv_phb        *phb;
 	struct pnv_ioda_pe    *pe, *pe_n;
 	struct pci_dn         *pdn;
-	u16                    vf_index;
-	int64_t                rc;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1371,35 +1365,6 @@ static void pnv_ioda_release_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 	if (!pdev->is_physfn)
 		return;
 
-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
-		int   vf_group;
-		int   vf_per_group;
-		int   vf_index1;
-
-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-
-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++)
-			for (vf_index = vf_group * vf_per_group;
-				vf_index < (vf_group + 1) * vf_per_group &&
-				vf_index < num_vfs;
-				vf_index++)
-				for (vf_index1 = vf_group * vf_per_group;
-					vf_index1 < (vf_group + 1) * vf_per_group &&
-					vf_index1 < num_vfs;
-					vf_index1++){
-
-					rc = opal_pci_set_peltv(phb->opal_id,
-						pdn->offset + vf_index,
-						pdn->offset + vf_index1,
-						OPAL_REMOVE_PE_FROM_DOMAIN);
-
-					if (rc)
-					    dev_warn(&pdev->dev, "%s: Failed to unlink same group PE#%d(%lld)\n",
-						__func__,
-						pdn->offset + vf_index1, rc);
-				}
-	}
-
 	list_for_each_entry_safe(pe, pe_n, &phb->ioda.pe_list, list) {
 		if (pe->parent_dev != pdev)
 			continue;
@@ -1424,7 +1389,7 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	struct pci_sriov      *iov;
-	u16 num_vfs;
+	u16                    num_vfs, i;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1434,18 +1399,25 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
 	num_vfs = pdn->num_vfs;
 
 	/* Release VF PEs */
-	pnv_ioda_release_vf_PE(pdev, num_vfs);
+	pnv_ioda_release_vf_PE(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
-		if (pdn->m64_per_iov == 1)
-			pnv_pci_vf_resource_shift(pdev, -pdn->offset);
+		if (!pdn->m64_single_mode)
+			pnv_pci_vf_resource_shift(pdev, -*pdn->pe_num_map);
 
 		/* Release M64 windows */
-		pnv_pci_vf_release_m64(pdev);
+		pnv_pci_vf_release_m64(pdev, num_vfs);
 
 		/* Release PE numbers */
-		bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
-		pdn->offset = 0;
+		if (pdn->m64_single_mode) {
+			for (i = 0; i < num_vfs; i++) {
+				if (pdn->pe_num_map[i] != IODA_INVALID_PE)
+					pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+			}
+		} else
+			bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
+		/* Releasing pe_num_map */
+		kfree(pdn->pe_num_map);
 	}
 }
 
@@ -1460,7 +1432,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 	int                    pe_num;
 	u16                    vf_index;
 	struct pci_dn         *pdn;
-	int64_t                rc;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1472,7 +1443,10 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 	/* Reserve PE for each VF */
 	for (vf_index = 0; vf_index < num_vfs; vf_index++) {
-		pe_num = pdn->offset + vf_index;
+		if (pdn->m64_single_mode)
+			pe_num = pdn->pe_num_map[vf_index];
+		else
+			pe_num = *pdn->pe_num_map + vf_index;
 
 		pe = &phb->ioda.pe_array[pe_num];
 		pe->pe_number = pe_num;
@@ -1505,37 +1479,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
 
 		pnv_pci_ioda2_setup_dma_pe(phb, pe);
 	}
-
-	if (pdn->m64_per_iov == M64_PER_IOV && num_vfs > M64_PER_IOV) {
-		int   vf_group;
-		int   vf_per_group;
-		int   vf_index1;
-
-		vf_per_group = roundup_pow_of_two(num_vfs) / pdn->m64_per_iov;
-
-		for (vf_group = 0; vf_group < M64_PER_IOV; vf_group++) {
-			for (vf_index = vf_group * vf_per_group;
-			     vf_index < (vf_group + 1) * vf_per_group &&
-			     vf_index < num_vfs;
-			     vf_index++) {
-				for (vf_index1 = vf_group * vf_per_group;
-				     vf_index1 < (vf_group + 1) * vf_per_group &&
-				     vf_index1 < num_vfs;
-				     vf_index1++) {
-
-					rc = opal_pci_set_peltv(phb->opal_id,
-						pdn->offset + vf_index,
-						pdn->offset + vf_index1,
-						OPAL_ADD_PE_TO_DOMAIN);
-
-					if (rc)
-					    dev_warn(&pdev->dev, "%s: Failed to link same group PE#%d(%lld)\n",
-						__func__,
-						pdn->offset + vf_index1, rc);
-				}
-			}
-		}
-	}
 }
 
 int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
@@ -1545,6 +1488,7 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	struct pnv_phb        *phb;
 	struct pci_dn         *pdn;
 	int                    ret;
+	u16                    i;
 
 	bus = pdev->bus;
 	hose = pci_bus_to_host(bus);
@@ -1552,20 +1496,59 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	pdn = pci_get_pdn(pdev);
 
 	if (phb->type == PNV_PHB_IODA2) {
+		if (!pdn->vfs_expanded) {
+			dev_info(&pdev->dev, "don't support this SRIOV device"
+				" with non 64bit-prefetchable IOV BAR\n");
+			return -ENOSPC;
+		}
+
+		/*
+		 * When M64 BARs functions in Single PE mode, the number of VFs
+		 * could be enabled must be less than the number of M64 BARs.
+		 */
+		if (pdn->m64_single_mode && num_vfs > phb->ioda.m64_bar_idx) {
+			dev_info(&pdev->dev, "Not enough M64 BAR for VFs\n");
+			return -EBUSY;
+		}
+
+		/* Allocating pe_num_map */
+		if (pdn->m64_single_mode)
+			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs,
+					GFP_KERNEL);
+		else
+			pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL);
+
+		if (!pdn->pe_num_map)
+			return -ENOMEM;
+
+		if (pdn->m64_single_mode)
+			for (i = 0; i < num_vfs; i++)
+				pdn->pe_num_map[i] = IODA_INVALID_PE;
+
 		/* Calculate available PE for required VFs */
-		mutex_lock(&phb->ioda.pe_alloc_mutex);
-		pdn->offset = bitmap_find_next_zero_area(
-			phb->ioda.pe_alloc, phb->ioda.total_pe,
-			0, num_vfs, 0);
-		if (pdn->offset >= phb->ioda.total_pe) {
+		if (pdn->m64_single_mode) {
+			for (i = 0; i < num_vfs; i++) {
+				pdn->pe_num_map[i] = pnv_ioda_alloc_pe(phb);
+				if (pdn->pe_num_map[i] == IODA_INVALID_PE) {
+					ret = -EBUSY;
+					goto m64_failed;
+				}
+			}
+		} else {
+			mutex_lock(&phb->ioda.pe_alloc_mutex);
+			*pdn->pe_num_map = bitmap_find_next_zero_area(
+				phb->ioda.pe_alloc, phb->ioda.total_pe,
+				0, num_vfs, 0);
+			if (*pdn->pe_num_map >= phb->ioda.total_pe) {
+				mutex_unlock(&phb->ioda.pe_alloc_mutex);
+				dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
+				kfree(pdn->pe_num_map);
+				return -EBUSY;
+			}
+			bitmap_set(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
 			mutex_unlock(&phb->ioda.pe_alloc_mutex);
-			dev_info(&pdev->dev, "Failed to enable VF%d\n", num_vfs);
-			pdn->offset = 0;
-			return -EBUSY;
 		}
-		bitmap_set(phb->ioda.pe_alloc, pdn->offset, num_vfs);
 		pdn->num_vfs = num_vfs;
-		mutex_unlock(&phb->ioda.pe_alloc_mutex);
 
 		/* Assign M64 window accordingly */
 		ret = pnv_pci_vf_assign_m64(pdev, num_vfs);
@@ -1579,8 +1562,8 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 		 * the IOV BAR according to the PE# allocated to the VFs.
 		 * Otherwise, the PE# for the VF will conflict with others.
 		 */
-		if (pdn->m64_per_iov == 1) {
-			ret = pnv_pci_vf_resource_shift(pdev, pdn->offset);
+		if (!pdn->m64_single_mode) {
+			ret = pnv_pci_vf_resource_shift(pdev, *pdn->pe_num_map);
 			if (ret)
 				goto m64_failed;
 		}
@@ -1592,8 +1575,16 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	return 0;
 
 m64_failed:
-	bitmap_clear(phb->ioda.pe_alloc, pdn->offset, num_vfs);
-	pdn->offset = 0;
+	if (pdn->m64_single_mode) {
+		for (i = 0; i < num_vfs; i++) {
+			if (pdn->pe_num_map[i] != IODA_INVALID_PE)
+				pnv_ioda_free_pe(phb, pdn->pe_num_map[i]);
+		}
+	} else
+		bitmap_clear(phb->ioda.pe_alloc, *pdn->pe_num_map, num_vfs);
+
+	/* Releasing pe_num_map */
+	kfree(pdn->pe_num_map);
 
 	return ret;
 }
@@ -1612,8 +1603,7 @@ int pcibios_sriov_enable(struct pci_dev *pdev, u16 num_vfs)
 	/* Allocate PCI data */
 	add_dev_pci_data(pdev);
 
-	pnv_pci_sriov_enable(pdev, num_vfs);
-	return 0;
+	return pnv_pci_sriov_enable(pdev, num_vfs);
 }
 #endif /* CONFIG_PCI_IOV */
 
@@ -2851,45 +2841,58 @@ static void pnv_pci_init_ioda_msis(struct pnv_phb *phb) { }
 #ifdef CONFIG_PCI_IOV
 static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 {
-	struct pci_controller *hose;
-	struct pnv_phb *phb;
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
+	const resource_size_t gate = phb->ioda.m64_segsize >> 2;
 	struct resource *res;
 	int i;
-	resource_size_t size;
+	resource_size_t size, total_vf_bar_sz;
 	struct pci_dn *pdn;
 	int mul, total_vfs;
 
 	if (!pdev->is_physfn || pdev->is_added)
 		return;
 
-	hose = pci_bus_to_host(pdev->bus);
-	phb = hose->private_data;
-
 	pdn = pci_get_pdn(pdev);
 	pdn->vfs_expanded = 0;
+	pdn->m64_single_mode = false;
 
 	total_vfs = pci_sriov_get_totalvfs(pdev);
-	pdn->m64_per_iov = 1;
 	mul = phb->ioda.total_pe;
+	total_vf_bar_sz = 0;
 
 	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || res->parent)
 			continue;
 		if (!pnv_pci_is_mem_pref_64(res->flags)) {
-			dev_warn(&pdev->dev, " non M64 VF BAR%d: %pR\n",
+			dev_warn(&pdev->dev, "Don't support SR-IOV with"
+					" non M64 VF BAR%d: %pR. \n",
 				 i, res);
-			continue;
+			goto truncate_iov;
 		}
 
-		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+		total_vf_bar_sz += pci_iov_resource_size(pdev,
+				i + PCI_IOV_RESOURCES);
 
-		/* bigger than 64M */
-		if (size > (1 << 26)) {
-			dev_info(&pdev->dev, "PowerNV: VF BAR%d: %pR IOV size is bigger than 64M, roundup power2\n",
-				 i, res);
-			pdn->m64_per_iov = M64_PER_IOV;
+		/*
+		 * If bigger than quarter of M64 segment size, just round up
+		 * power of two.
+		 *
+		 * Generally, one M64 BAR maps one IOV BAR. To avoid conflict
+		 * with other devices, IOV BAR size is expanded to be
+		 * (total_pe * VF_BAR_size).  When VF_BAR_size is half of M64
+		 * segment size , the expanded size would equal to half of the
+		 * whole M64 space size, which will exhaust the M64 Space and
+		 * limit the system flexibility.  This is a design decision to
+		 * set the boundary to quarter of the M64 segment size.
+		 */
+		if (total_vf_bar_sz > gate) {
 			mul = roundup_pow_of_two(total_vfs);
+			dev_info(&pdev->dev,
+				"VF BAR Total IOV size %llx > %llx, roundup to %d VFs\n",
+				total_vf_bar_sz, gate, mul);
+			pdn->m64_single_mode = true;
 			break;
 		}
 	}
@@ -2898,20 +2901,31 @@ static void pnv_pci_ioda_fixup_iov_resources(struct pci_dev *pdev)
 		res = &pdev->resource[i + PCI_IOV_RESOURCES];
 		if (!res->flags || res->parent)
 			continue;
-		if (!pnv_pci_is_mem_pref_64(res->flags)) {
-			dev_warn(&pdev->dev, "Skipping expanding VF BAR%d: %pR\n",
-				 i, res);
-			continue;
-		}
 
-		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
 		size = pci_iov_resource_size(pdev, i + PCI_IOV_RESOURCES);
+		/*
+		 * On PHB3, the minimum size alignment of M64 BAR in single
+		 * mode is 32MB.
+		 */
+		if (pdn->m64_single_mode && (size < SZ_32M))
+			goto truncate_iov;
+		dev_dbg(&pdev->dev, " Fixing VF BAR%d: %pR to\n", i, res);
 		res->end = res->start + size * mul - 1;
 		dev_dbg(&pdev->dev, "                       %pR\n", res);
 		dev_info(&pdev->dev, "VF BAR%d: %pR (expanded to %d VFs for PE alignment)",
 			 i, res, mul);
 	}
 	pdn->vfs_expanded = mul;
+
+	return;
+
+truncate_iov:
+	/* To save MMIO space, IOV BAR is truncated. */
+	for (i = 0; i < PCI_SRIOV_NUM_BARS; i++) {
+		res = &pdev->resource[i + PCI_IOV_RESOURCES];
+		res->flags = 0;
+		res->end = res->start - 1;
+	}
 }
 #endif /* CONFIG_PCI_IOV */
 
@@ -3125,18 +3139,35 @@ static resource_size_t pnv_pci_window_alignment(struct pci_bus *bus,
 static resource_size_t pnv_pci_iov_resource_alignment(struct pci_dev *pdev,
 						      int resno)
 {
+	struct pci_controller *hose = pci_bus_to_host(pdev->bus);
+	struct pnv_phb *phb = hose->private_data;
 	struct pci_dn *pdn = pci_get_pdn(pdev);
-	resource_size_t align, iov_align;
-
-	iov_align = resource_size(&pdev->resource[resno]);
-	if (iov_align)
-		return iov_align;
+	resource_size_t align;
 
+	/*
+	 * On PowerNV platform, IOV BAR is mapped by M64 BAR to enable the
+	 * SR-IOV. While from hardware perspective, the range mapped by M64
+	 * BAR should be size aligned.
+	 *
+	 * When IOV BAR is mapped with M64 BAR in Single PE mode, the extra
+	 * powernv-specific hardware restriction is gone. But if just use the
+	 * VF BAR size as the alignment, PF BAR / VF BAR may be allocated with
+	 * in one segment of M64 #15, which introduces the PE conflict between
+	 * PF and VF. Based on this, the minimum alignment of an IOV BAR is
+	 * m64_segsize.
+	 *
+	 * This function returns the total IOV BAR size if M64 BAR is in
+	 * Shared PE mode or just VF BAR size if not.
+	 * If the M64 BAR is in Single PE mode, return the VF BAR size or
+	 * M64 segment size if IOV BAR size is less.
+	 */
 	align = pci_iov_resource_size(pdev, resno);
-	if (pdn->vfs_expanded)
-		return pdn->vfs_expanded * align;
+	if (!pdn->vfs_expanded)
+		return align;
+	if (pdn->m64_single_mode)
+		return max(align, (resource_size_t)phb->ioda.m64_segsize);
 
-	return align;
+	return pdn->vfs_expanded * align;
 }
 #endif /* CONFIG_PCI_IOV */
 
@@ -3180,6 +3211,7 @@ static void pnv_pci_ioda_shutdown(struct pci_controller *hose)
 
 static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
        .dma_dev_setup = pnv_pci_dma_dev_setup,
+       .dma_bus_setup = pnv_pci_dma_bus_setup,
 #ifdef CONFIG_PCI_MSI
        .setup_msi_irqs = pnv_setup_msi_irqs,
        .teardown_msi_irqs = pnv_teardown_msi_irqs,
diff --git a/arch/powerpc/platforms/powernv/pci-p5ioc2.c b/arch/powerpc/platforms/powernv/pci-p5ioc2.c
deleted file mode 100644
index f2bdfea3b68d..000000000000
--- a/arch/powerpc/platforms/powernv/pci-p5ioc2.c
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Support PCI/PCIe on PowerNV platforms
- *
- * Currently supports only P5IOC2
- *
- * Copyright 2011 Benjamin Herrenschmidt, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/kernel.h>
-#include <linux/pci.h>
-#include <linux/delay.h>
-#include <linux/string.h>
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/irq.h>
-#include <linux/io.h>
-#include <linux/msi.h>
-
-#include <asm/sections.h>
-#include <asm/io.h>
-#include <asm/prom.h>
-#include <asm/pci-bridge.h>
-#include <asm/machdep.h>
-#include <asm/msi_bitmap.h>
-#include <asm/ppc-pci.h>
-#include <asm/opal.h>
-#include <asm/iommu.h>
-#include <asm/tce.h>
-
-#include "powernv.h"
-#include "pci.h"
-
-/* For now, use a fixed amount of TCE memory for each p5ioc2
- * hub, 16M will do
- */
-#define P5IOC2_TCE_MEMORY	0x01000000
-
-#ifdef CONFIG_PCI_MSI
-static int pnv_pci_p5ioc2_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
-				    unsigned int hwirq, unsigned int virq,
-				    unsigned int is_64, struct msi_msg *msg)
-{
-	if (WARN_ON(!is_64))
-		return -ENXIO;
-	msg->data = hwirq - phb->msi_base;
-	msg->address_hi = 0x10000000;
-	msg->address_lo = 0;
-
-	return 0;
-}
-
-static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb)
-{
-	unsigned int count;
-	const __be32 *prop = of_get_property(phb->hose->dn,
-					     "ibm,opal-msi-ranges", NULL);
-	if (!prop)
-		return;
-
-	/* Don't do MSI's on p5ioc2 PCI-X are they are not properly
-	 * verified in HW
-	 */
-	if (of_device_is_compatible(phb->hose->dn, "ibm,p5ioc2-pcix"))
-		return;
-	phb->msi_base = be32_to_cpup(prop);
-	count = be32_to_cpup(prop + 1);
-	if (msi_bitmap_alloc(&phb->msi_bmp, count, phb->hose->dn)) {
-		pr_err("PCI %d: Failed to allocate MSI bitmap !\n",
-		       phb->hose->global_number);
-		return;
-	}
-	phb->msi_setup = pnv_pci_p5ioc2_msi_setup;
-	phb->msi32_support = 0;
-	pr_info(" Allocated bitmap for %d MSIs (base IRQ 0x%x)\n",
-		count, phb->msi_base);
-}
-#else
-static void pnv_pci_init_p5ioc2_msis(struct pnv_phb *phb) { }
-#endif /* CONFIG_PCI_MSI */
-
-static struct iommu_table_ops pnv_p5ioc2_iommu_ops = {
-	.set = pnv_tce_build,
-#ifdef CONFIG_IOMMU_API
-	.exchange = pnv_tce_xchg,
-#endif
-	.clear = pnv_tce_free,
-	.get = pnv_tce_get,
-};
-
-static void pnv_pci_p5ioc2_dma_dev_setup(struct pnv_phb *phb,
-					 struct pci_dev *pdev)
-{
-	struct iommu_table *tbl = phb->p5ioc2.table_group.tables[0];
-
-	if (!tbl->it_map) {
-		tbl->it_ops = &pnv_p5ioc2_iommu_ops;
-		iommu_init_table(tbl, phb->hose->node);
-		iommu_register_group(&phb->p5ioc2.table_group,
-				pci_domain_nr(phb->hose->bus), phb->opal_id);
-		INIT_LIST_HEAD_RCU(&tbl->it_group_list);
-		pnv_pci_link_table_and_group(phb->hose->node, 0,
-				tbl, &phb->p5ioc2.table_group);
-	}
-
-	set_iommu_table_base(&pdev->dev, tbl);
-	iommu_add_device(&pdev->dev);
-}
-
-static const struct pci_controller_ops pnv_pci_p5ioc2_controller_ops = {
-	.dma_dev_setup = pnv_pci_dma_dev_setup,
-#ifdef CONFIG_PCI_MSI
-       .setup_msi_irqs = pnv_setup_msi_irqs,
-       .teardown_msi_irqs = pnv_teardown_msi_irqs,
-#endif
-};
-
-static void __init pnv_pci_init_p5ioc2_phb(struct device_node *np, u64 hub_id,
-					   void *tce_mem, u64 tce_size)
-{
-	struct pnv_phb *phb;
-	const __be64 *prop64;
-	u64 phb_id;
-	int64_t rc;
-	static int primary = 1;
-	struct iommu_table_group *table_group;
-	struct iommu_table *tbl;
-
-	pr_info(" Initializing p5ioc2 PHB %s\n", np->full_name);
-
-	prop64 = of_get_property(np, "ibm,opal-phbid", NULL);
-	if (!prop64) {
-		pr_err("  Missing \"ibm,opal-phbid\" property !\n");
-		return;
-	}
-	phb_id = be64_to_cpup(prop64);
-	pr_devel("  PHB-ID  : 0x%016llx\n", phb_id);
-	pr_devel("  TCE AT  : 0x%016lx\n", __pa(tce_mem));
-	pr_devel("  TCE SZ  : 0x%016llx\n", tce_size);
-
-	rc = opal_pci_set_phb_tce_memory(phb_id, __pa(tce_mem), tce_size);
-	if (rc != OPAL_SUCCESS) {
-		pr_err("  Failed to set TCE memory, OPAL error %lld\n", rc);
-		return;
-	}
-
-	phb = memblock_virt_alloc(sizeof(struct pnv_phb), 0);
-	phb->hose = pcibios_alloc_controller(np);
-	if (!phb->hose) {
-		pr_err("  Failed to allocate PCI controller\n");
-		return;
-	}
-
-	spin_lock_init(&phb->lock);
-	phb->hose->first_busno = 0;
-	phb->hose->last_busno = 0xff;
-	phb->hose->private_data = phb;
-	phb->hose->controller_ops = pnv_pci_p5ioc2_controller_ops;
-	phb->hub_id = hub_id;
-	phb->opal_id = phb_id;
-	phb->type = PNV_PHB_P5IOC2;
-	phb->model = PNV_PHB_MODEL_P5IOC2;
-
-	phb->regs = of_iomap(np, 0);
-
-	if (phb->regs == NULL)
-		pr_err("  Failed to map registers !\n");
-	else {
-		pr_devel("  P_BUID     = 0x%08x\n", in_be32(phb->regs + 0x100));
-		pr_devel("  P_IOSZ     = 0x%08x\n", in_be32(phb->regs + 0x1b0));
-		pr_devel("  P_IO_ST    = 0x%08x\n", in_be32(phb->regs + 0x1e0));
-		pr_devel("  P_MEM1_H   = 0x%08x\n", in_be32(phb->regs + 0x1a0));
-		pr_devel("  P_MEM1_L   = 0x%08x\n", in_be32(phb->regs + 0x190));
-		pr_devel("  P_MSZ1_L   = 0x%08x\n", in_be32(phb->regs + 0x1c0));
-		pr_devel("  P_MEM_ST   = 0x%08x\n", in_be32(phb->regs + 0x1d0));
-		pr_devel("  P_MEM2_H   = 0x%08x\n", in_be32(phb->regs + 0x2c0));
-		pr_devel("  P_MEM2_L   = 0x%08x\n", in_be32(phb->regs + 0x2b0));
-		pr_devel("  P_MSZ2_H   = 0x%08x\n", in_be32(phb->regs + 0x2d0));
-		pr_devel("  P_MSZ2_L   = 0x%08x\n", in_be32(phb->regs + 0x2e0));
-	}
-
-	/* Interpret the "ranges" property */
-	/* This also maps the I/O region and sets isa_io/mem_base */
-	pci_process_bridge_OF_ranges(phb->hose, np, primary);
-	primary = 0;
-
-	phb->hose->ops = &pnv_pci_ops;
-
-	/* Setup MSI support */
-	pnv_pci_init_p5ioc2_msis(phb);
-
-	/* Setup TCEs */
-	phb->dma_dev_setup = pnv_pci_p5ioc2_dma_dev_setup;
-	pnv_pci_setup_iommu_table(&phb->p5ioc2.iommu_table,
-				  tce_mem, tce_size, 0,
-				  IOMMU_PAGE_SHIFT_4K);
-	/*
-	 * We do not allocate iommu_table as we do not support
-	 * hotplug or SRIOV on P5IOC2 and therefore iommu_free_table()
-	 * should not be called for phb->p5ioc2.table_group.tables[0] ever.
-	 */
-	tbl = phb->p5ioc2.table_group.tables[0] = &phb->p5ioc2.iommu_table;
-	table_group = &phb->p5ioc2.table_group;
-	table_group->tce32_start = tbl->it_offset << tbl->it_page_shift;
-	table_group->tce32_size = tbl->it_size << tbl->it_page_shift;
-}
-
-void __init pnv_pci_init_p5ioc2_hub(struct device_node *np)
-{
-	struct device_node *phbn;
-	const __be64 *prop64;
-	u64 hub_id;
-	void *tce_mem;
-	uint64_t tce_per_phb;
-	int64_t rc;
-	int phb_count = 0;
-
-	pr_info("Probing p5ioc2 IO-Hub %s\n", np->full_name);
-
-	prop64 = of_get_property(np, "ibm,opal-hubid", NULL);
-	if (!prop64) {
-		pr_err(" Missing \"ibm,opal-hubid\" property !\n");
-		return;
-	}
-	hub_id = be64_to_cpup(prop64);
-	pr_info(" HUB-ID : 0x%016llx\n", hub_id);
-
-	/* Count child PHBs and calculate TCE space per PHB */
-	for_each_child_of_node(np, phbn) {
-		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
-		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex"))
-			phb_count++;
-	}
-
-	if (phb_count <= 0) {
-		pr_info(" No PHBs for Hub %s\n", np->full_name);
-		return;
-	}
-
-	tce_per_phb = __rounddown_pow_of_two(P5IOC2_TCE_MEMORY / phb_count);
-	pr_info(" Allocating %lld MB of TCE memory per PHB\n",
-		tce_per_phb >> 20);
-
-	/* Currently allocate 16M of TCE memory for every Hub
-	 *
-	 * XXX TODO: Make it chip local if possible
-	 */
-	tce_mem = memblock_virt_alloc(P5IOC2_TCE_MEMORY, P5IOC2_TCE_MEMORY);
-	pr_debug(" TCE    : 0x%016lx..0x%016lx\n",
-		__pa(tce_mem), __pa(tce_mem) + P5IOC2_TCE_MEMORY - 1);
-	rc = opal_pci_set_hub_tce_memory(hub_id, __pa(tce_mem),
-					P5IOC2_TCE_MEMORY);
-	if (rc != OPAL_SUCCESS) {
-		pr_err(" Failed to allocate TCE memory, OPAL error %lld\n", rc);
-		return;
-	}
-
-	/* Initialize PHBs */
-	for_each_child_of_node(np, phbn) {
-		if (of_device_is_compatible(phbn, "ibm,p5ioc2-pcix") ||
-		    of_device_is_compatible(phbn, "ibm,p5ioc2-pciex")) {
-			pnv_pci_init_p5ioc2_phb(phbn, hub_id,
-					tce_mem, tce_per_phb);
-			tce_mem += tce_per_phb;
-		}
-	}
-}
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 2f55c86df703..73c8dc2a353f 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -380,10 +380,7 @@ static void pnv_pci_config_check_eeh(struct pci_dn *pdn)
 	 */
 	pe_no = pdn->pe_number;
 	if (pe_no == IODA_INVALID_PE) {
-		if (phb->type == PNV_PHB_P5IOC2)
-			pe_no = 0;
-		else
-			pe_no = phb->ioda.reserved_pe;
+		pe_no = phb->ioda.reserved_pe;
 	}
 
 	/*
@@ -599,6 +596,9 @@ int pnv_tce_build(struct iommu_table *tbl, long index, long npages,
 	u64 rpn = __pa(uaddr) >> tbl->it_page_shift;
 	long i;
 
+	if (proto_tce & TCE_PCI_WRITE)
+		proto_tce |= TCE_PCI_READ;
+
 	for (i = 0; i < npages; i++) {
 		unsigned long newtce = proto_tce |
 			((rpn + i) << tbl->it_page_shift);
@@ -620,6 +620,9 @@ int pnv_tce_xchg(struct iommu_table *tbl, long index,
 
 	BUG_ON(*hpa & ~IOMMU_PAGE_MASK(tbl));
 
+	if (newtce & TCE_PCI_WRITE)
+		newtce |= TCE_PCI_READ;
+
 	oldtce = xchg(pnv_tce(tbl, idx), cpu_to_be64(newtce));
 	*hpa = be64_to_cpu(oldtce) & ~(TCE_PCI_READ | TCE_PCI_WRITE);
 	*direction = iommu_tce_direction(oldtce);
@@ -760,6 +763,26 @@ void pnv_pci_dma_dev_setup(struct pci_dev *pdev)
 		phb->dma_dev_setup(phb, pdev);
 }
 
+void pnv_pci_dma_bus_setup(struct pci_bus *bus)
+{
+	struct pci_controller *hose = bus->sysdata;
+	struct pnv_phb *phb = hose->private_data;
+	struct pnv_ioda_pe *pe;
+
+	list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+		if (!(pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)))
+			continue;
+
+		if (!pe->pbus)
+			continue;
+
+		if (bus->number == ((pe->rid >> 8) & 0xFF)) {
+			pe->pbus = bus;
+			break;
+		}
+	}
+}
+
 void pnv_pci_shutdown(void)
 {
 	struct pci_controller *hose;
@@ -779,7 +802,6 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_IBM, 0x3b9, pnv_p7ioc_rc_quirk);
 void __init pnv_pci_init(void)
 {
 	struct device_node *np;
-	bool found_ioda = false;
 
 	pci_add_flags(PCI_CAN_SKIP_ISA_ALIGN);
 
@@ -787,20 +809,11 @@ void __init pnv_pci_init(void)
 	if (!firmware_has_feature(FW_FEATURE_OPAL))
 		return;
 
-	/* Look for IODA IO-Hubs. We don't support mixing IODA
-	 * and p5ioc2 due to the need to change some global
-	 * probing flags
-	 */
+	/* Look for IODA IO-Hubs. */
 	for_each_compatible_node(np, NULL, "ibm,ioda-hub") {
 		pnv_pci_init_ioda_hub(np);
-		found_ioda = true;
 	}
 
-	/* Look for p5ioc2 IO-Hubs */
-	if (!found_ioda)
-		for_each_compatible_node(np, NULL, "ibm,p5ioc2")
-			pnv_pci_init_p5ioc2_hub(np);
-
 	/* Look for ioda2 built-in PHB3's */
 	for_each_compatible_node(np, NULL, "ibm,ioda2-phb")
 		pnv_pci_init_ioda2_phb(np);
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index 7f56313e8d72..3f814f382b2e 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -4,16 +4,14 @@
 struct pci_dn;
 
 enum pnv_phb_type {
-	PNV_PHB_P5IOC2	= 0,
-	PNV_PHB_IODA1	= 1,
-	PNV_PHB_IODA2	= 2,
-	PNV_PHB_NPU	= 3,
+	PNV_PHB_IODA1	= 0,
+	PNV_PHB_IODA2	= 1,
+	PNV_PHB_NPU	= 2,
 };
 
 /* Precise PHB model for error management */
 enum pnv_phb_model {
 	PNV_PHB_MODEL_UNKNOWN,
-	PNV_PHB_MODEL_P5IOC2,
 	PNV_PHB_MODEL_P7IOC,
 	PNV_PHB_MODEL_PHB3,
 	PNV_PHB_MODEL_NPU,
@@ -121,81 +119,74 @@ struct pnv_phb {
 	void (*freeze_pe)(struct pnv_phb *phb, int pe_no);
 	int (*unfreeze_pe)(struct pnv_phb *phb, int pe_no, int opt);
 
-	union {
-		struct {
-			struct iommu_table iommu_table;
-			struct iommu_table_group table_group;
-		} p5ioc2;
-
-		struct {
-			/* Global bridge info */
-			unsigned int		total_pe;
-			unsigned int		reserved_pe;
-
-			/* 32-bit MMIO window */
-			unsigned int		m32_size;
-			unsigned int		m32_segsize;
-			unsigned int		m32_pci_base;
-
-			/* 64-bit MMIO window */
-			unsigned int		m64_bar_idx;
-			unsigned long		m64_size;
-			unsigned long		m64_segsize;
-			unsigned long		m64_base;
-			unsigned long		m64_bar_alloc;
-
-			/* IO ports */
-			unsigned int		io_size;
-			unsigned int		io_segsize;
-			unsigned int		io_pci_base;
-
-			/* PE allocation bitmap */
-			unsigned long		*pe_alloc;
-			/* PE allocation mutex */
-			struct mutex		pe_alloc_mutex;
-
-			/* M32 & IO segment maps */
-			unsigned int		*m32_segmap;
-			unsigned int		*io_segmap;
-			struct pnv_ioda_pe	*pe_array;
-
-			/* IRQ chip */
-			int			irq_chip_init;
-			struct irq_chip		irq_chip;
-
-			/* Sorted list of used PE's based
-			 * on the sequence of creation
-			 */
-			struct list_head	pe_list;
-			struct mutex            pe_list_mutex;
-
-			/* Reverse map of PEs, will have to extend if
-			 * we are to support more than 256 PEs, indexed
-			 * bus { bus, devfn }
-			 */
-			unsigned char		pe_rmap[0x10000];
-
-			/* 32-bit TCE tables allocation */
-			unsigned long		tce32_count;
-
-			/* Total "weight" for the sake of DMA resources
-			 * allocation
-			 */
-			unsigned int		dma_weight;
-			unsigned int		dma_pe_count;
-
-			/* Sorted list of used PE's, sorted at
-			 * boot for resource allocation purposes
-			 */
-			struct list_head	pe_dma_list;
-
-			/* TCE cache invalidate registers (physical and
-			 * remapped)
-			 */
-			phys_addr_t		tce_inval_reg_phys;
-			__be64 __iomem		*tce_inval_reg;
-		} ioda;
-	};
+	struct {
+		/* Global bridge info */
+		unsigned int		total_pe;
+		unsigned int		reserved_pe;
+
+		/* 32-bit MMIO window */
+		unsigned int		m32_size;
+		unsigned int		m32_segsize;
+		unsigned int		m32_pci_base;
+
+		/* 64-bit MMIO window */
+		unsigned int		m64_bar_idx;
+		unsigned long		m64_size;
+		unsigned long		m64_segsize;
+		unsigned long		m64_base;
+		unsigned long		m64_bar_alloc;
+
+		/* IO ports */
+		unsigned int		io_size;
+		unsigned int		io_segsize;
+		unsigned int		io_pci_base;
+
+		/* PE allocation bitmap */
+		unsigned long		*pe_alloc;
+		/* PE allocation mutex */
+		struct mutex		pe_alloc_mutex;
+
+		/* M32 & IO segment maps */
+		unsigned int		*m32_segmap;
+		unsigned int		*io_segmap;
+		struct pnv_ioda_pe	*pe_array;
+
+		/* IRQ chip */
+		int			irq_chip_init;
+		struct irq_chip		irq_chip;
+
+		/* Sorted list of used PE's based
+		 * on the sequence of creation
+		 */
+		struct list_head	pe_list;
+		struct mutex            pe_list_mutex;
+
+		/* Reverse map of PEs, will have to extend if
+		 * we are to support more than 256 PEs, indexed
+		 * bus { bus, devfn }
+		 */
+		unsigned char		pe_rmap[0x10000];
+
+		/* 32-bit TCE tables allocation */
+		unsigned long		tce32_count;
+
+		/* Total "weight" for the sake of DMA resources
+		 * allocation
+		 */
+		unsigned int		dma_weight;
+		unsigned int		dma_pe_count;
+
+		/* Sorted list of used PE's, sorted at
+		 * boot for resource allocation purposes
+		 */
+		struct list_head	pe_dma_list;
+
+		/* TCE cache invalidate registers (physical and
+		 * remapped)
+		 */
+		phys_addr_t		tce_inval_reg_phys;
+		__be64 __iomem		*tce_inval_reg;
+	} ioda;
 
 	/* PHB and hub status structure */
 	union {
@@ -232,7 +223,6 @@ extern void pnv_pci_unlink_table_and_group(struct iommu_table *tbl,
 extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
 				      void *tce_mem, u64 tce_size,
 				      u64 dma_offset, unsigned page_shift);
-extern void pnv_pci_init_p5ioc2_hub(struct device_node *np);
 extern void pnv_pci_init_ioda_hub(struct device_node *np);
 extern void pnv_pci_init_ioda2_phb(struct device_node *np);
 extern void pnv_pci_init_npu_phb(struct device_node *np);
@@ -242,6 +232,7 @@ extern void pnv_pci_reset_secondary_bus(struct pci_dev *dev);
 extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
 
 extern void pnv_pci_dma_dev_setup(struct pci_dev *pdev);
+extern void pnv_pci_dma_bus_setup(struct pci_bus *bus);
 extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
 extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
 
diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c
index 503a73f59359..0babef11136f 100644
--- a/arch/powerpc/platforms/powernv/subcore.c
+++ b/arch/powerpc/platforms/powernv/subcore.c
@@ -407,7 +407,7 @@ static DEVICE_ATTR(subcores_per_core, 0644,
 
 static int subcore_init(void)
 {
-	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+	if (!cpu_has_feature(CPU_FTR_SUBCORE))
 		return 0;
 
 	/*
diff --git a/arch/powerpc/platforms/ps3/gelic_udbg.c b/arch/powerpc/platforms/ps3/gelic_udbg.c
index 20b46a19a48f..09bf24d616a5 100644
--- a/arch/powerpc/platforms/ps3/gelic_udbg.c
+++ b/arch/powerpc/platforms/ps3/gelic_udbg.c
@@ -13,6 +13,12 @@
  *
  */
 
+#include <linux/if_ether.h>
+#include <linux/etherdevice.h>
+#include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+
 #include <asm/io.h>
 #include <asm/udbg.h>
 #include <asm/lv1call.h>
@@ -56,39 +62,8 @@ struct debug_block {
 	u8 pkt[1520];
 } __packed;
 
-struct ethhdr {
-	u8 dest[6];
-	u8 src[6];
-	u16 type;
-} __packed;
-
-struct vlantag {
-	u16 vlan;
-	u16 subtype;
-} __packed;
-
-struct iphdr {
-	u8 ver_len;
-	u8 dscp_ecn;
-	u16 total_length;
-	u16 ident;
-	u16 frag_off_flags;
-	u8 ttl;
-	u8 proto;
-	u16 checksum;
-	u32 src;
-	u32 dest;
-} __packed;
-
-struct udphdr {
-	u16 src;
-	u16 dest;
-	u16 len;
-	u16 checksum;
-} __packed;
-
 static __iomem struct ethhdr *h_eth;
-static __iomem struct vlantag *h_vlan;
+static __iomem struct vlan_hdr *h_vlan;
 static __iomem struct iphdr *h_ip;
 static __iomem struct udphdr *h_udp;
 
@@ -173,8 +148,8 @@ static void gelic_debug_init(void)
 
 	h_eth = (struct ethhdr *)dbg.pkt;
 
-	memset(&h_eth->dest, 0xff, 6);
-	memcpy(&h_eth->src, &mac, 6);
+	eth_broadcast_addr(h_eth->h_dest);
+	memcpy(&h_eth->h_source, &mac, ETH_ALEN);
 
 	header_size = sizeof(struct ethhdr);
 
@@ -183,28 +158,29 @@ static void gelic_debug_init(void)
 				 GELIC_LV1_VLAN_TX_ETHERNET_0, 0, 0,
 				 &vlan_id, &v2);
 	if (!result) {
-		h_eth->type = 0x8100;
+		h_eth->h_proto= ETH_P_8021Q;
 
-		header_size += sizeof(struct vlantag);
-		h_vlan = (struct vlantag *)(h_eth + 1);
-		h_vlan->vlan = vlan_id;
-		h_vlan->subtype = 0x0800;
+		header_size += sizeof(struct vlan_hdr);
+		h_vlan = (struct vlan_hdr *)(h_eth + 1);
+		h_vlan->h_vlan_TCI = vlan_id;
+		h_vlan->h_vlan_encapsulated_proto = ETH_P_IP;
 		h_ip = (struct iphdr *)(h_vlan + 1);
 	} else {
-		h_eth->type = 0x0800;
+		h_eth->h_proto= 0x0800;
 		h_ip = (struct iphdr *)(h_eth + 1);
 	}
 
 	header_size += sizeof(struct iphdr);
-	h_ip->ver_len = 0x45;
+	h_ip->version = 4;
+	h_ip->ihl = 5;
 	h_ip->ttl = 10;
-	h_ip->proto = 0x11;
-	h_ip->src = 0x00000000;
-	h_ip->dest = 0xffffffff;
+	h_ip->protocol = 0x11;
+	h_ip->saddr = 0x00000000;
+	h_ip->daddr = 0xffffffff;
 
 	header_size += sizeof(struct udphdr);
 	h_udp = (struct udphdr *)(h_ip + 1);
-	h_udp->src = GELIC_DEBUG_PORT;
+	h_udp->source = GELIC_DEBUG_PORT;
 	h_udp->dest = GELIC_DEBUG_PORT;
 
 	pmsgc = pmsg = (char *)(h_udp + 1);
@@ -225,16 +201,16 @@ static void gelic_sendbuf(int msgsize)
 	int i;
 
 	dbg.descr.buf_size = header_size + msgsize;
-	h_ip->total_length = msgsize + sizeof(struct udphdr) +
+	h_ip->tot_len = msgsize + sizeof(struct udphdr) +
 			     sizeof(struct iphdr);
 	h_udp->len = msgsize + sizeof(struct udphdr);
 
-	h_ip->checksum = 0;
+	h_ip->check = 0;
 	sum = 0;
 	p = (u16 *)h_ip;
 	for (i = 0; i < 5; i++)
 		sum += *p++;
-	h_ip->checksum = ~(sum + (sum >> 16));
+	h_ip->check = ~(sum + (sum >> 16));
 
 	dbg.descr.dmac_cmd_status = GELIC_DESCR_DMA_CMD_NO_CHKSUM |
 				    GELIC_DESCR_TX_DMA_FRAME_TAIL;
diff --git a/arch/powerpc/platforms/ps3/interrupt.c b/arch/powerpc/platforms/ps3/interrupt.c
index 638c4060938e..b831638e6f4a 100644
--- a/arch/powerpc/platforms/ps3/interrupt.c
+++ b/arch/powerpc/platforms/ps3/interrupt.c
@@ -78,7 +78,7 @@ struct ps3_bmp {
 /**
  * struct ps3_private - a per cpu data structure
  * @bmp: ps3_bmp structure
- * @bmp_lock: Syncronize access to bmp.
+ * @bmp_lock: Synchronize access to bmp.
  * @ipi_debug_brk_mask: Mask for debug break IPIs
  * @ppe_id: HV logical_ppe_id
  * @thread_id: HV thread_id
diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c
index 849b29b3e9ae..74da18de853a 100644
--- a/arch/powerpc/platforms/pseries/hvconsole.c
+++ b/arch/powerpc/platforms/pseries/hvconsole.c
@@ -31,7 +31,7 @@
 #include <asm/plpar_wrappers.h>
 
 /**
- * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper
+ * hvc_get_chars - retrieve characters from firmware for denoted vterm adapter
  * @vtermno: The vtermno or unit_address of the adapter from which to fetch the
  *	data.
  * @buf: The character buffer into which to put the character data fetched from
diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index 477290ad855e..2415a0d31f8f 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -505,8 +505,8 @@ static void pSeries_lpar_hugepage_invalidate(unsigned long vsid,
 }
 #endif
 
-static void pSeries_lpar_hpte_removebolted(unsigned long ea,
-					   int psize, int ssize)
+static int pSeries_lpar_hpte_removebolted(unsigned long ea,
+					  int psize, int ssize)
 {
 	unsigned long vpn;
 	unsigned long slot, vsid;
@@ -515,11 +515,14 @@ static void pSeries_lpar_hpte_removebolted(unsigned long ea,
 	vpn = hpt_vpn(ea, vsid, ssize);
 
 	slot = pSeries_lpar_hpte_find(vpn, psize, ssize);
-	BUG_ON(slot == -1);
+	if (slot == -1)
+		return -ENOENT;
+
 	/*
 	 * lpar doesn't use the passed actual page size
 	 */
 	pSeries_lpar_hpte_invalidate(slot, vpn, psize, 0, ssize, 0);
+	return 0;
 }
 
 /*
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 36df46eaba24..6e944fc6e5f9 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -515,7 +515,7 @@ static void __init pSeries_setup_arch(void)
 
 	fwnmi_init();
 
-	/* By default, only probe PCI (can be overriden by rtas_pci) */
+	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	pci_add_flags(PCI_PROBE_ONLY);
 
 	/* Find and initialize PCI host bridges */
diff --git a/arch/powerpc/sysdev/fsl_pci.c b/arch/powerpc/sysdev/fsl_pci.c
index c69e88e91459..85729f49764f 100644
--- a/arch/powerpc/sysdev/fsl_pci.c
+++ b/arch/powerpc/sysdev/fsl_pci.c
@@ -575,7 +575,7 @@ int fsl_add_bridge(struct platform_device *pdev, int is_primary)
 	if (early_find_capability(hose, 0, 0, PCI_CAP_ID_EXP)) {
 		/* use fsl_indirect_read_config for PCIe */
 		hose->ops = &fsl_indirect_pcie_ops;
-		/* For PCIE read HEADER_TYPE to identify controler mode */
+		/* For PCIE read HEADER_TYPE to identify controller mode */
 		early_read_config_byte(hose, 0, 0, PCI_HEADER_TYPE, &hdr_type);
 		if ((hdr_type & 0x7f) != PCI_HEADER_TYPE_BRIDGE)
 			goto no_bridge;
diff --git a/arch/powerpc/sysdev/fsl_rmu.c b/arch/powerpc/sysdev/fsl_rmu.c
index b48197ae44d0..ffe0ee832768 100644
--- a/arch/powerpc/sysdev/fsl_rmu.c
+++ b/arch/powerpc/sysdev/fsl_rmu.c
@@ -570,7 +570,7 @@ int fsl_rio_port_write_init(struct fsl_rio_pw *pw)
 	out_be32(&pw->pw_regs->pwsr,
 		 (RIO_IPWSR_TE | RIO_IPWSR_QFI | RIO_IPWSR_PWD));
 
-	/* Configure port write contoller for snooping enable all reporting,
+	/* Configure port write controller for snooping enable all reporting,
 	   clear queue full */
 	out_be32(&pw->pw_regs->pwmr,
 		 RIO_IPWMR_SEN | RIO_IPWMR_QFIE | RIO_IPWMR_EIE | RIO_IPWMR_CQ);
diff --git a/arch/powerpc/sysdev/i8259.c b/arch/powerpc/sysdev/i8259.c
index 6f99ed3967fd..aa2c186d3115 100644
--- a/arch/powerpc/sysdev/i8259.c
+++ b/arch/powerpc/sysdev/i8259.c
@@ -238,7 +238,7 @@ void i8259_init(struct device_node *node, unsigned long intack_addr)
 	/* init master interrupt controller */
 	outb(0x11, 0x20); /* Start init sequence */
 	outb(0x00, 0x21); /* Vector base */
-	outb(0x04, 0x21); /* edge tiggered, Cascade (slave) on IRQ2 */
+	outb(0x04, 0x21); /* edge triggered, Cascade (slave) on IRQ2 */
 	outb(0x01, 0x21); /* Select 8086 mode */
 
 	/* init slave interrupt controller */
diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c
index 2a0452e364ba..afe3c7cd395d 100644
--- a/arch/powerpc/sysdev/mpic.c
+++ b/arch/powerpc/sysdev/mpic.c
@@ -2,7 +2,7 @@
  *  arch/powerpc/kernel/mpic.c
  *
  *  Driver for interrupt controllers following the OpenPIC standard, the
- *  common implementation beeing IBM's MPIC. This driver also can deal
+ *  common implementation being IBM's MPIC. This driver also can deal
  *  with various broken implementations of this HW.
  *
  *  Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp.
@@ -1657,7 +1657,7 @@ void __init mpic_init(struct mpic *mpic)
 		}
 	}
 
-	/* FSL mpic error interrupt intialization */
+	/* FSL mpic error interrupt initialization */
 	if (mpic->flags & MPIC_FSL_HAS_EIMR)
 		mpic_err_int_init(mpic, MPIC_FSL_ERR_INT);
 }
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index 07a8508cb7fa..942796fa4767 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -47,6 +47,9 @@
 #include <asm/debug.h>
 #include <asm/hw_breakpoint.h>
 
+#include <asm/opal.h>
+#include <asm/firmware.h>
+
 #ifdef CONFIG_PPC64
 #include <asm/hvcall.h>
 #include <asm/paca.h>
@@ -119,6 +122,16 @@ static void dump(void);
 static void prdump(unsigned long, long);
 static int ppc_inst_dump(unsigned long, long, int);
 static void dump_log_buf(void);
+
+#ifdef CONFIG_PPC_POWERNV
+static void dump_opal_msglog(void);
+#else
+static inline void dump_opal_msglog(void)
+{
+	printf("Machine is not running OPAL firmware.\n");
+}
+#endif
+
 static void backtrace(struct pt_regs *);
 static void excprint(struct pt_regs *);
 static void prregs(struct pt_regs *);
@@ -150,6 +163,7 @@ static int  cpu_cmd(void);
 static void csum(void);
 static void bootcmds(void);
 static void proccall(void);
+static void show_tasks(void);
 void dump_segments(void);
 static void symbol_lookup(void);
 static void xmon_show_stack(unsigned long sp, unsigned long lr,
@@ -202,6 +216,10 @@ Commands:\n\
   df	dump float values\n\
   dd	dump double values\n\
   dl    dump the kernel log buffer\n"
+#ifdef CONFIG_PPC_POWERNV
+  "\
+  do    dump the OPAL message log\n"
+#endif
 #ifdef CONFIG_PPC64
   "\
   dp[#]	dump paca for current cpu, or cpu #\n\
@@ -221,6 +239,7 @@ Commands:\n\
   mz	zero a block of memory\n\
   mi	show information about memory allocation\n\
   p 	call a procedure\n\
+  P 	list processes/tasks\n\
   r	print registers\n\
   s	single step\n"
 #ifdef CONFIG_SPU_BASE
@@ -233,7 +252,7 @@ Commands:\n\
 "  S	print special registers\n\
   t	print backtrace\n\
   x	exit monitor and recover\n\
-  X	exit monitor and dont recover\n"
+  X	exit monitor and don't recover\n"
 #if defined(CONFIG_PPC64) && !defined(CONFIG_PPC_BOOK3E)
 "  u	dump segment table or SLB\n"
 #elif defined(CONFIG_PPC_STD_MMU_32)
@@ -950,6 +969,9 @@ cmds(struct pt_regs *excp)
 		case 'p':
 			proccall();
 			break;
+		case 'P':
+			show_tasks();
+			break;
 #ifdef CONFIG_PPC_STD_MMU
 		case 'u':
 			dump_segments();
@@ -2253,6 +2275,8 @@ dump(void)
 		last_cmd = "di\n";
 	} else if (c == 'l') {
 		dump_log_buf();
+	} else if (c == 'o') {
+		dump_opal_msglog();
 	} else if (c == 'r') {
 		scanhex(&ndump);
 		if (ndump == 0)
@@ -2395,6 +2419,45 @@ dump_log_buf(void)
 	catch_memory_errors = 0;
 }
 
+#ifdef CONFIG_PPC_POWERNV
+static void dump_opal_msglog(void)
+{
+	unsigned char buf[128];
+	ssize_t res;
+	loff_t pos = 0;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL)) {
+		printf("Machine is not running OPAL firmware.\n");
+		return;
+	}
+
+	if (setjmp(bus_error_jmp) != 0) {
+		printf("Error dumping OPAL msglog!\n");
+		return;
+	}
+
+	catch_memory_errors = 1;
+	sync();
+
+	xmon_start_pagination();
+	while ((res = opal_msglog_copy(buf, pos, sizeof(buf) - 1))) {
+		if (res < 0) {
+			printf("Error dumping OPAL msglog! Error: %zd\n", res);
+			break;
+		}
+		buf[res] = '\0';
+		printf("%s", buf);
+		pos += res;
+	}
+	xmon_end_pagination();
+
+	sync();
+	/* wait a little while to see if we get a machine check */
+	__delay(200);
+	catch_memory_errors = 0;
+}
+#endif
+
 /*
  * Memory operations - move, set, print differences
  */
@@ -2508,6 +2571,61 @@ memzcan(void)
 		printf("%.8x\n", a - mskip);
 }
 
+static void show_task(struct task_struct *tsk)
+{
+	char state;
+
+	/*
+	 * Cloned from kdb_task_state_char(), which is not entirely
+	 * appropriate for calling from xmon. This could be moved
+	 * to a common, generic, routine used by both.
+	 */
+	state = (tsk->state == 0) ? 'R' :
+		(tsk->state < 0) ? 'U' :
+		(tsk->state & TASK_UNINTERRUPTIBLE) ? 'D' :
+		(tsk->state & TASK_STOPPED) ? 'T' :
+		(tsk->state & TASK_TRACED) ? 'C' :
+		(tsk->exit_state & EXIT_ZOMBIE) ? 'Z' :
+		(tsk->exit_state & EXIT_DEAD) ? 'E' :
+		(tsk->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
+
+	printf("%p %016lx %6d %6d %c %2d %s\n", tsk,
+		tsk->thread.ksp,
+		tsk->pid, tsk->parent->pid,
+		state, task_thread_info(tsk)->cpu,
+		tsk->comm);
+}
+
+static void show_tasks(void)
+{
+	unsigned long tskv;
+	struct task_struct *tsk = NULL;
+
+	printf("     task_struct     ->thread.ksp    PID   PPID S  P CMD\n");
+
+	if (scanhex(&tskv))
+		tsk = (struct task_struct *)tskv;
+
+	if (setjmp(bus_error_jmp) != 0) {
+		catch_memory_errors = 0;
+		printf("*** Error dumping task %p\n", tsk);
+		return;
+	}
+
+	catch_memory_errors = 1;
+	sync();
+
+	if (tsk)
+		show_task(tsk);
+	else
+		for_each_process(tsk)
+			show_task(tsk);
+
+	sync();
+	__delay(200);
+	catch_memory_errors = 0;
+}
+
 static void proccall(void)
 {
 	unsigned long args[8];
diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile
index be2ac5ce349f..8a55c1aa11aa 100644
--- a/drivers/misc/cxl/Makefile
+++ b/drivers/misc/cxl/Makefile
@@ -4,6 +4,7 @@ ccflags-$(CONFIG_PPC_WERROR)	+= -Werror
 cxl-y				+= main.o file.o irq.o fault.o native.o
 cxl-y				+= context.o sysfs.o debugfs.o pci.o trace.o
 cxl-y				+= vphb.o api.o
+cxl-$(CONFIG_PPC_PSERIES)	+= flash.o guest.o of.o hcalls.o
 obj-$(CONFIG_CXL)		+= cxl.o
 obj-$(CONFIG_CXL_BASE)		+= base.o
 
diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c
index ea3eeb7011e1..2107c948406d 100644
--- a/drivers/misc/cxl/api.c
+++ b/drivers/misc/cxl/api.c
@@ -51,8 +51,6 @@ struct cxl_context *cxl_dev_context_init(struct pci_dev *dev)
 	if (rc)
 		goto err_mapping;
 
-	cxl_assign_psn_space(ctx);
-
 	return ctx;
 
 err_mapping:
@@ -78,7 +76,6 @@ struct device *cxl_get_phys_dev(struct pci_dev *dev)
 
 	return afu->adapter->dev.parent;
 }
-EXPORT_SYMBOL_GPL(cxl_get_phys_dev);
 
 int cxl_release_context(struct cxl_context *ctx)
 {
@@ -91,28 +88,11 @@ int cxl_release_context(struct cxl_context *ctx)
 }
 EXPORT_SYMBOL_GPL(cxl_release_context);
 
-int cxl_allocate_afu_irqs(struct cxl_context *ctx, int num)
-{
-	if (num == 0)
-		num = ctx->afu->pp_irqs;
-	return afu_allocate_irqs(ctx, num);
-}
-EXPORT_SYMBOL_GPL(cxl_allocate_afu_irqs);
-
-void cxl_free_afu_irqs(struct cxl_context *ctx)
-{
-	afu_irq_name_free(ctx);
-	cxl_release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
-}
-EXPORT_SYMBOL_GPL(cxl_free_afu_irqs);
-
 static irq_hw_number_t cxl_find_afu_irq(struct cxl_context *ctx, int num)
 {
 	__u16 range;
 	int r;
 
-	WARN_ON(num == 0);
-
 	for (r = 0; r < CXL_IRQ_RANGES; r++) {
 		range = ctx->irqs.range[r];
 		if (num < range) {
@@ -123,6 +103,44 @@ static irq_hw_number_t cxl_find_afu_irq(struct cxl_context *ctx, int num)
 	return 0;
 }
 
+int cxl_allocate_afu_irqs(struct cxl_context *ctx, int num)
+{
+	int res;
+	irq_hw_number_t hwirq;
+
+	if (num == 0)
+		num = ctx->afu->pp_irqs;
+	res = afu_allocate_irqs(ctx, num);
+	if (!res && !cpu_has_feature(CPU_FTR_HVMODE)) {
+		/* In a guest, the PSL interrupt is not multiplexed. It was
+		 * allocated above, and we need to set its handler
+		 */
+		hwirq = cxl_find_afu_irq(ctx, 0);
+		if (hwirq)
+			cxl_map_irq(ctx->afu->adapter, hwirq, cxl_ops->psl_interrupt, ctx, "psl");
+	}
+	return res;
+}
+EXPORT_SYMBOL_GPL(cxl_allocate_afu_irqs);
+
+void cxl_free_afu_irqs(struct cxl_context *ctx)
+{
+	irq_hw_number_t hwirq;
+	unsigned int virq;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE)) {
+		hwirq = cxl_find_afu_irq(ctx, 0);
+		if (hwirq) {
+			virq = irq_find_mapping(NULL, hwirq);
+			if (virq)
+				cxl_unmap_irq(virq, ctx);
+		}
+	}
+	afu_irq_name_free(ctx);
+	cxl_ops->release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
+}
+EXPORT_SYMBOL_GPL(cxl_free_afu_irqs);
+
 int cxl_map_afu_irq(struct cxl_context *ctx, int num,
 		    irq_handler_t handler, void *cookie, char *name)
 {
@@ -178,7 +196,7 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed,
 
 	cxl_ctx_get();
 
-	if ((rc = cxl_attach_process(ctx, kernel, wed , 0))) {
+	if ((rc = cxl_ops->attach_process(ctx, kernel, wed, 0))) {
 		put_pid(ctx->pid);
 		cxl_ctx_put();
 		goto out;
@@ -193,7 +211,7 @@ EXPORT_SYMBOL_GPL(cxl_start_context);
 
 int cxl_process_element(struct cxl_context *ctx)
 {
-	return ctx->pe;
+	return ctx->external_pe;
 }
 EXPORT_SYMBOL_GPL(cxl_process_element);
 
@@ -207,7 +225,6 @@ EXPORT_SYMBOL_GPL(cxl_stop_context);
 void cxl_set_master(struct cxl_context *ctx)
 {
 	ctx->master = true;
-	cxl_assign_psn_space(ctx);
 }
 EXPORT_SYMBOL_GPL(cxl_set_master);
 
@@ -325,15 +342,11 @@ EXPORT_SYMBOL_GPL(cxl_start_work);
 
 void __iomem *cxl_psa_map(struct cxl_context *ctx)
 {
-	struct cxl_afu *afu = ctx->afu;
-	int rc;
-
-	rc = cxl_afu_check_and_enable(afu);
-	if (rc)
+	if (ctx->status != STARTED)
 		return NULL;
 
 	pr_devel("%s: psn_phys%llx size:%llx\n",
-		 __func__, afu->psn_phys, afu->adapter->ps_size);
+		__func__, ctx->psn_phys, ctx->psn_size);
 	return ioremap(ctx->psn_phys, ctx->psn_size);
 }
 EXPORT_SYMBOL_GPL(cxl_psa_map);
@@ -349,11 +362,11 @@ int cxl_afu_reset(struct cxl_context *ctx)
 	struct cxl_afu *afu = ctx->afu;
 	int rc;
 
-	rc = __cxl_afu_reset(afu);
+	rc = cxl_ops->afu_reset(afu);
 	if (rc)
 		return rc;
 
-	return cxl_afu_check_and_enable(afu);
+	return cxl_ops->afu_check_and_enable(afu);
 }
 EXPORT_SYMBOL_GPL(cxl_afu_reset);
 
@@ -363,3 +376,11 @@ void cxl_perst_reloads_same_image(struct cxl_afu *afu,
 	afu->adapter->perst_same_image = perst_reloads_same_image;
 }
 EXPORT_SYMBOL_GPL(cxl_perst_reloads_same_image);
+
+ssize_t cxl_read_adapter_vpd(struct pci_dev *dev, void *buf, size_t count)
+{
+	struct cxl_afu *afu = cxl_pci_to_afu(dev);
+
+	return cxl_ops->read_adapter_vpd(afu->adapter, buf, count);
+}
+EXPORT_SYMBOL_GPL(cxl_read_adapter_vpd);
diff --git a/drivers/misc/cxl/base.c b/drivers/misc/cxl/base.c
index a9f0dd3255a2..9b90ec6c07cd 100644
--- a/drivers/misc/cxl/base.c
+++ b/drivers/misc/cxl/base.c
@@ -11,6 +11,7 @@
 #include <linux/rcupdate.h>
 #include <asm/errno.h>
 #include <misc/cxl-base.h>
+#include <linux/of_platform.h>
 #include "cxl.h"
 
 /* protected by rcu */
@@ -84,3 +85,34 @@ void unregister_cxl_calls(struct cxl_calls *calls)
 	synchronize_rcu();
 }
 EXPORT_SYMBOL_GPL(unregister_cxl_calls);
+
+int cxl_update_properties(struct device_node *dn,
+			  struct property *new_prop)
+{
+	return of_update_property(dn, new_prop);
+}
+EXPORT_SYMBOL_GPL(cxl_update_properties);
+
+static int __init cxl_base_init(void)
+{
+	struct device_node *np = NULL;
+	struct platform_device *dev;
+	int count = 0;
+
+	/*
+	 * Scan for compatible devices in guest only
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		return 0;
+
+	while ((np = of_find_compatible_node(np, NULL,
+				     "ibm,coherent-platform-facility"))) {
+		dev = of_platform_device_create(np, NULL, NULL);
+		if (dev)
+			count++;
+	}
+	pr_devel("Found %d cxl device(s)\n", count);
+	return 0;
+}
+
+module_init(cxl_base_init);
diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c
index 262b88eac414..10370f280500 100644
--- a/drivers/misc/cxl/context.c
+++ b/drivers/misc/cxl/context.c
@@ -95,7 +95,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master,
 		return i;
 
 	ctx->pe = i;
-	ctx->elem = &ctx->afu->spa[i];
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		ctx->elem = &ctx->afu->native->spa[i];
+		ctx->external_pe = ctx->pe;
+	} else {
+		ctx->external_pe = -1; /* assigned when attaching */
+	}
 	ctx->pe_inserted = false;
 
 	/*
@@ -214,8 +219,8 @@ int __detach_context(struct cxl_context *ctx)
 	/* Only warn if we detached while the link was OK.
 	 * If detach fails when hw is down, we don't care.
 	 */
-	WARN_ON(cxl_detach_process(ctx) &&
-		cxl_adapter_link_ok(ctx->afu->adapter));
+	WARN_ON(cxl_ops->detach_process(ctx) &&
+		cxl_ops->link_ok(ctx->afu->adapter, ctx->afu));
 	flush_work(&ctx->fault_work); /* Only needed for dedicated process */
 
 	/* release the reference to the group leader and mm handling pid */
diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h
index a521bc72cec2..38e21cf7806e 100644
--- a/drivers/misc/cxl/cxl.h
+++ b/drivers/misc/cxl/cxl.h
@@ -324,6 +324,10 @@ static const cxl_p2n_reg_t CXL_PSL_WED_An     = {0x0A0};
 #define CXL_MODE_TIME_SLICED 0x4
 #define CXL_SUPPORTED_MODES (CXL_MODE_DEDICATED | CXL_MODE_DIRECTED)
 
+#define CXL_DEV_MINORS 13   /* 1 control + 4 AFUs * 3 (dedicated/master/shared) */
+#define CXL_CARD_MINOR(adapter) (adapter->adapter_num * CXL_DEV_MINORS)
+#define CXL_DEVT_ADAPTER(dev) (MINOR(dev) / CXL_DEV_MINORS)
+
 enum cxl_context_status {
 	CLOSED,
 	OPENED,
@@ -336,6 +340,12 @@ enum prefault_modes {
 	CXL_PREFAULT_ALL,
 };
 
+enum cxl_attrs {
+	CXL_ADAPTER_ATTRS,
+	CXL_AFU_MASTER_ATTRS,
+	CXL_AFU_ATTRS,
+};
+
 struct cxl_sste {
 	__be64 esid_data;
 	__be64 vsid_data;
@@ -344,18 +354,46 @@ struct cxl_sste {
 #define to_cxl_adapter(d) container_of(d, struct cxl, dev)
 #define to_cxl_afu(d) container_of(d, struct cxl_afu, dev)
 
-struct cxl_afu {
+struct cxl_afu_native {
+	void __iomem *p1n_mmio;
+	void __iomem *afu_desc_mmio;
 	irq_hw_number_t psl_hwirq;
+	unsigned int psl_virq;
+	struct mutex spa_mutex;
+	/*
+	 * Only the first part of the SPA is used for the process element
+	 * linked list. The only other part that software needs to worry about
+	 * is sw_command_status, which we store a separate pointer to.
+	 * Everything else in the SPA is only used by hardware
+	 */
+	struct cxl_process_element *spa;
+	__be64 *sw_command_status;
+	unsigned int spa_size;
+	int spa_order;
+	int spa_max_procs;
+	u64 pp_offset;
+};
+
+struct cxl_afu_guest {
+	u64 handle;
+	phys_addr_t p2n_phys;
+	u64 p2n_size;
+	int max_ints;
+	struct mutex recovery_lock;
+	int previous_state;
+};
+
+struct cxl_afu {
+	struct cxl_afu_native *native;
+	struct cxl_afu_guest *guest;
 	irq_hw_number_t serr_hwirq;
-	char *err_irq_name;
-	char *psl_irq_name;
 	unsigned int serr_virq;
-	void __iomem *p1n_mmio;
+	char *psl_irq_name;
+	char *err_irq_name;
 	void __iomem *p2n_mmio;
 	phys_addr_t psn_phys;
-	u64 pp_offset;
 	u64 pp_size;
-	void __iomem *afu_desc_mmio;
+
 	struct cxl *adapter;
 	struct device dev;
 	struct cdev afu_cdev_s, afu_cdev_m, afu_cdev_d;
@@ -363,26 +401,12 @@ struct cxl_afu {
 	struct idr contexts_idr;
 	struct dentry *debugfs;
 	struct mutex contexts_lock;
-	struct mutex spa_mutex;
 	spinlock_t afu_cntl_lock;
 
 	/* AFU error buffer fields and bin attribute for sysfs */
 	u64 eb_len, eb_offset;
 	struct bin_attribute attr_eb;
 
-	/*
-	 * Only the first part of the SPA is used for the process element
-	 * linked list. The only other part that software needs to worry about
-	 * is sw_command_status, which we store a separate pointer to.
-	 * Everything else in the SPA is only used by hardware
-	 */
-	struct cxl_process_element *spa;
-	__be64 *sw_command_status;
-	unsigned int spa_size;
-	int spa_order;
-	int spa_max_procs;
-	unsigned int psl_virq;
-
 	/* pointer to the vphb */
 	struct pci_controller *phb;
 
@@ -421,6 +445,12 @@ struct cxl_irq_name {
 	char *name;
 };
 
+struct irq_avail {
+	irq_hw_number_t offset;
+	irq_hw_number_t range;
+	unsigned long   *bitmap;
+};
+
 /*
  * This is a cxl context.  If the PSL is in dedicated mode, there will be one
  * of these per AFU.  If in AFU directed there can be lots of these.
@@ -476,7 +506,19 @@ struct cxl_context {
 
 	struct cxl_process_element *elem;
 
-	int pe; /* process element handle */
+	/*
+	 * pe is the process element handle, assigned by this driver when the
+	 * context is initialized.
+	 *
+	 * external_pe is the PE shown outside of cxl.
+	 * On bare-metal, pe=external_pe, because we decide what the handle is.
+	 * In a guest, we only find out about the pe used by pHyp when the
+	 * context is attached, and that's the value we want to report outside
+	 * of cxl.
+	 */
+	int pe;
+	int external_pe;
+
 	u32 irq_count;
 	bool pe_inserted;
 	bool master;
@@ -488,11 +530,34 @@ struct cxl_context {
 	struct rcu_head rcu;
 };
 
-struct cxl {
+struct cxl_native {
+	u64 afu_desc_off;
+	u64 afu_desc_size;
 	void __iomem *p1_mmio;
 	void __iomem *p2_mmio;
 	irq_hw_number_t err_hwirq;
 	unsigned int err_virq;
+	u64 ps_off;
+};
+
+struct cxl_guest {
+	struct platform_device *pdev;
+	int irq_nranges;
+	struct cdev cdev;
+	irq_hw_number_t irq_base_offset;
+	struct irq_avail *irq_avail;
+	spinlock_t irq_alloc_lock;
+	u64 handle;
+	char *status;
+	u16 vendor;
+	u16 device;
+	u16 subsystem_vendor;
+	u16 subsystem;
+};
+
+struct cxl {
+	struct cxl_native *native;
+	struct cxl_guest *guest;
 	spinlock_t afu_list_lock;
 	struct cxl_afu *afu[CXL_MAX_SLICES];
 	struct device dev;
@@ -503,9 +568,6 @@ struct cxl {
 	struct bin_attribute cxl_attr;
 	int adapter_num;
 	int user_irqs;
-	u64 afu_desc_off;
-	u64 afu_desc_size;
-	u64 ps_off;
 	u64 ps_size;
 	u16 psl_rev;
 	u16 base_image;
@@ -519,13 +581,15 @@ struct cxl {
 	bool perst_same_image;
 };
 
-int cxl_alloc_one_irq(struct cxl *adapter);
-void cxl_release_one_irq(struct cxl *adapter, int hwirq);
-int cxl_alloc_irq_ranges(struct cxl_irq_ranges *irqs, struct cxl *adapter, unsigned int num);
-void cxl_release_irq_ranges(struct cxl_irq_ranges *irqs, struct cxl *adapter);
-int cxl_setup_irq(struct cxl *adapter, unsigned int hwirq, unsigned int virq);
+int cxl_pci_alloc_one_irq(struct cxl *adapter);
+void cxl_pci_release_one_irq(struct cxl *adapter, int hwirq);
+int cxl_pci_alloc_irq_ranges(struct cxl_irq_ranges *irqs, struct cxl *adapter, unsigned int num);
+void cxl_pci_release_irq_ranges(struct cxl_irq_ranges *irqs, struct cxl *adapter);
+int cxl_pci_setup_irq(struct cxl *adapter, unsigned int hwirq, unsigned int virq);
 int cxl_update_image_control(struct cxl *adapter);
-int cxl_reset(struct cxl *adapter);
+int cxl_pci_reset(struct cxl *adapter);
+void cxl_pci_release_afu(struct device *dev);
+ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len);
 
 /* common == phyp + powernv */
 struct cxl_process_element_common {
@@ -555,29 +619,32 @@ struct cxl_process_element {
 	__be32 software_state;
 } __packed;
 
-static inline bool cxl_adapter_link_ok(struct cxl *cxl)
+static inline bool cxl_adapter_link_ok(struct cxl *cxl, struct cxl_afu *afu)
 {
 	struct pci_dev *pdev;
 
-	pdev = to_pci_dev(cxl->dev.parent);
-	return !pci_channel_offline(pdev);
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		pdev = to_pci_dev(cxl->dev.parent);
+		return !pci_channel_offline(pdev);
+	}
+	return true;
 }
 
 static inline void __iomem *_cxl_p1_addr(struct cxl *cxl, cxl_p1_reg_t reg)
 {
 	WARN_ON(!cpu_has_feature(CPU_FTR_HVMODE));
-	return cxl->p1_mmio + cxl_reg_off(reg);
+	return cxl->native->p1_mmio + cxl_reg_off(reg);
 }
 
 static inline void cxl_p1_write(struct cxl *cxl, cxl_p1_reg_t reg, u64 val)
 {
-	if (likely(cxl_adapter_link_ok(cxl)))
+	if (likely(cxl_adapter_link_ok(cxl, NULL)))
 		out_be64(_cxl_p1_addr(cxl, reg), val);
 }
 
 static inline u64 cxl_p1_read(struct cxl *cxl, cxl_p1_reg_t reg)
 {
-	if (likely(cxl_adapter_link_ok(cxl)))
+	if (likely(cxl_adapter_link_ok(cxl, NULL)))
 		return in_be64(_cxl_p1_addr(cxl, reg));
 	else
 		return ~0ULL;
@@ -586,18 +653,18 @@ static inline u64 cxl_p1_read(struct cxl *cxl, cxl_p1_reg_t reg)
 static inline void __iomem *_cxl_p1n_addr(struct cxl_afu *afu, cxl_p1n_reg_t reg)
 {
 	WARN_ON(!cpu_has_feature(CPU_FTR_HVMODE));
-	return afu->p1n_mmio + cxl_reg_off(reg);
+	return afu->native->p1n_mmio + cxl_reg_off(reg);
 }
 
 static inline void cxl_p1n_write(struct cxl_afu *afu, cxl_p1n_reg_t reg, u64 val)
 {
-	if (likely(cxl_adapter_link_ok(afu->adapter)))
+	if (likely(cxl_adapter_link_ok(afu->adapter, afu)))
 		out_be64(_cxl_p1n_addr(afu, reg), val);
 }
 
 static inline u64 cxl_p1n_read(struct cxl_afu *afu, cxl_p1n_reg_t reg)
 {
-	if (likely(cxl_adapter_link_ok(afu->adapter)))
+	if (likely(cxl_adapter_link_ok(afu->adapter, afu)))
 		return in_be64(_cxl_p1n_addr(afu, reg));
 	else
 		return ~0ULL;
@@ -610,39 +677,19 @@ static inline void __iomem *_cxl_p2n_addr(struct cxl_afu *afu, cxl_p2n_reg_t reg
 
 static inline void cxl_p2n_write(struct cxl_afu *afu, cxl_p2n_reg_t reg, u64 val)
 {
-	if (likely(cxl_adapter_link_ok(afu->adapter)))
+	if (likely(cxl_adapter_link_ok(afu->adapter, afu)))
 		out_be64(_cxl_p2n_addr(afu, reg), val);
 }
 
 static inline u64 cxl_p2n_read(struct cxl_afu *afu, cxl_p2n_reg_t reg)
 {
-	if (likely(cxl_adapter_link_ok(afu->adapter)))
+	if (likely(cxl_adapter_link_ok(afu->adapter, afu)))
 		return in_be64(_cxl_p2n_addr(afu, reg));
 	else
 		return ~0ULL;
 }
 
-static inline u64 cxl_afu_cr_read64(struct cxl_afu *afu, int cr, u64 off)
-{
-	if (likely(cxl_adapter_link_ok(afu->adapter)))
-		return in_le64((afu)->afu_desc_mmio + (afu)->crs_offset +
-			       ((cr) * (afu)->crs_len) + (off));
-	else
-		return ~0ULL;
-}
-
-static inline u32 cxl_afu_cr_read32(struct cxl_afu *afu, int cr, u64 off)
-{
-	if (likely(cxl_adapter_link_ok(afu->adapter)))
-		return in_le32((afu)->afu_desc_mmio + (afu)->crs_offset +
-			       ((cr) * (afu)->crs_len) + (off));
-	else
-		return 0xffffffff;
-}
-u16 cxl_afu_cr_read16(struct cxl_afu *afu, int cr, u64 off);
-u8 cxl_afu_cr_read8(struct cxl_afu *afu, int cr, u64 off);
-
-ssize_t cxl_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
+ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
 				loff_t off, size_t count);
 
 
@@ -652,13 +699,14 @@ struct cxl_calls {
 };
 int register_cxl_calls(struct cxl_calls *calls);
 void unregister_cxl_calls(struct cxl_calls *calls);
+int cxl_update_properties(struct device_node *dn, struct property *new_prop);
 
-int cxl_alloc_adapter_nr(struct cxl *adapter);
 void cxl_remove_adapter_nr(struct cxl *adapter);
 
 int cxl_alloc_spa(struct cxl_afu *afu);
 void cxl_release_spa(struct cxl_afu *afu);
 
+dev_t cxl_get_dev(void);
 int cxl_file_init(void);
 void cxl_file_exit(void);
 int cxl_register_adapter(struct cxl *adapter);
@@ -679,21 +727,19 @@ void cxl_sysfs_afu_remove(struct cxl_afu *afu);
 int cxl_sysfs_afu_m_add(struct cxl_afu *afu);
 void cxl_sysfs_afu_m_remove(struct cxl_afu *afu);
 
-int cxl_afu_activate_mode(struct cxl_afu *afu, int mode);
-int _cxl_afu_deactivate_mode(struct cxl_afu *afu, int mode);
-int cxl_afu_deactivate_mode(struct cxl_afu *afu);
+struct cxl *cxl_alloc_adapter(void);
+struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice);
 int cxl_afu_select_best_mode(struct cxl_afu *afu);
 
-int cxl_register_psl_irq(struct cxl_afu *afu);
-void cxl_release_psl_irq(struct cxl_afu *afu);
-int cxl_register_psl_err_irq(struct cxl *adapter);
-void cxl_release_psl_err_irq(struct cxl *adapter);
-int cxl_register_serr_irq(struct cxl_afu *afu);
-void cxl_release_serr_irq(struct cxl_afu *afu);
+int cxl_native_register_psl_irq(struct cxl_afu *afu);
+void cxl_native_release_psl_irq(struct cxl_afu *afu);
+int cxl_native_register_psl_err_irq(struct cxl *adapter);
+void cxl_native_release_psl_err_irq(struct cxl *adapter);
+int cxl_native_register_serr_irq(struct cxl_afu *afu);
+void cxl_native_release_serr_irq(struct cxl_afu *afu);
 int afu_register_irqs(struct cxl_context *ctx, u32 count);
 void afu_release_irqs(struct cxl_context *ctx, void *cookie);
 void afu_irq_name_free(struct cxl_context *ctx);
-irqreturn_t cxl_slice_irq_err(int irq, void *data);
 
 int cxl_debugfs_init(void);
 void cxl_debugfs_exit(void);
@@ -707,6 +753,7 @@ void cxl_prefault(struct cxl_context *ctx, u64 wed);
 
 struct cxl *get_cxl_adapter(int num);
 int cxl_alloc_sst(struct cxl_context *ctx);
+void cxl_dump_debug_buffer(void *addr, size_t size);
 
 void init_cxl_native(void);
 
@@ -720,40 +767,54 @@ unsigned int cxl_map_irq(struct cxl *adapter, irq_hw_number_t hwirq,
 void cxl_unmap_irq(unsigned int virq, void *cookie);
 int __detach_context(struct cxl_context *ctx);
 
-/* This matches the layout of the H_COLLECT_CA_INT_INFO retbuf */
+/*
+ * This must match the layout of the H_COLLECT_CA_INT_INFO retbuf defined
+ * in PAPR.
+ * A word about endianness: a pointer to this structure is passed when
+ * calling the hcall. However, it is not a block of memory filled up by
+ * the hypervisor. The return values are found in registers, and copied
+ * one by one when returning from the hcall. See the end of the call to
+ * plpar_hcall9() in hvCall.S
+ * As a consequence:
+ * - we don't need to do any endianness conversion
+ * - the pid and tid are an exception. They are 32-bit values returned in
+ *   the same 64-bit register. So we do need to worry about byte ordering.
+ */
 struct cxl_irq_info {
 	u64 dsisr;
 	u64 dar;
 	u64 dsr;
+#ifndef CONFIG_CPU_LITTLE_ENDIAN
 	u32 pid;
 	u32 tid;
+#else
+	u32 tid;
+	u32 pid;
+#endif
 	u64 afu_err;
 	u64 errstat;
-	u64 padding[3]; /* to match the expected retbuf size for plpar_hcall9 */
+	u64 proc_handle;
+	u64 padding[2]; /* to match the expected retbuf size for plpar_hcall9 */
 };
 
 void cxl_assign_psn_space(struct cxl_context *ctx);
-int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed,
-			    u64 amr);
-int cxl_detach_process(struct cxl_context *ctx);
-
-int cxl_get_irq(struct cxl_afu *afu, struct cxl_irq_info *info);
-int cxl_ack_irq(struct cxl_context *ctx, u64 tfc, u64 psl_reset_mask);
+irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info);
+int cxl_register_one_irq(struct cxl *adapter, irq_handler_t handler,
+			void *cookie, irq_hw_number_t *dest_hwirq,
+			unsigned int *dest_virq, const char *name);
 
 int cxl_check_error(struct cxl_afu *afu);
 int cxl_afu_slbia(struct cxl_afu *afu);
 int cxl_tlb_slb_invalidate(struct cxl *adapter);
 int cxl_afu_disable(struct cxl_afu *afu);
-int __cxl_afu_reset(struct cxl_afu *afu);
-int cxl_afu_check_and_enable(struct cxl_afu *afu);
 int cxl_psl_purge(struct cxl_afu *afu);
 
 void cxl_stop_trace(struct cxl *cxl);
 int cxl_pci_vphb_add(struct cxl_afu *afu);
-void cxl_pci_vphb_reconfigure(struct cxl_afu *afu);
 void cxl_pci_vphb_remove(struct cxl_afu *afu);
 
 extern struct pci_driver cxl_pci_driver;
+extern struct platform_driver cxl_of_driver;
 int afu_allocate_irqs(struct cxl_context *ctx, u32 count);
 
 int afu_open(struct inode *inode, struct file *file);
@@ -764,4 +825,61 @@ unsigned int afu_poll(struct file *file, struct poll_table_struct *poll);
 ssize_t afu_read(struct file *file, char __user *buf, size_t count, loff_t *off);
 extern const struct file_operations afu_fops;
 
+struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_device *dev);
+void cxl_guest_remove_adapter(struct cxl *adapter);
+int cxl_of_read_adapter_handle(struct cxl *adapter, struct device_node *np);
+int cxl_of_read_adapter_properties(struct cxl *adapter, struct device_node *np);
+ssize_t cxl_guest_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len);
+ssize_t cxl_guest_read_afu_vpd(struct cxl_afu *afu, void *buf, size_t len);
+int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_np);
+void cxl_guest_remove_afu(struct cxl_afu *afu);
+int cxl_of_read_afu_handle(struct cxl_afu *afu, struct device_node *afu_np);
+int cxl_of_read_afu_properties(struct cxl_afu *afu, struct device_node *afu_np);
+int cxl_guest_add_chardev(struct cxl *adapter);
+void cxl_guest_remove_chardev(struct cxl *adapter);
+void cxl_guest_reload_module(struct cxl *adapter);
+int cxl_of_probe(struct platform_device *pdev);
+
+struct cxl_backend_ops {
+	struct module *module;
+	int (*adapter_reset)(struct cxl *adapter);
+	int (*alloc_one_irq)(struct cxl *adapter);
+	void (*release_one_irq)(struct cxl *adapter, int hwirq);
+	int (*alloc_irq_ranges)(struct cxl_irq_ranges *irqs,
+				struct cxl *adapter, unsigned int num);
+	void (*release_irq_ranges)(struct cxl_irq_ranges *irqs,
+				struct cxl *adapter);
+	int (*setup_irq)(struct cxl *adapter, unsigned int hwirq,
+			unsigned int virq);
+	irqreturn_t (*handle_psl_slice_error)(struct cxl_context *ctx,
+					u64 dsisr, u64 errstat);
+	irqreturn_t (*psl_interrupt)(int irq, void *data);
+	int (*ack_irq)(struct cxl_context *ctx, u64 tfc, u64 psl_reset_mask);
+	int (*attach_process)(struct cxl_context *ctx, bool kernel,
+			u64 wed, u64 amr);
+	int (*detach_process)(struct cxl_context *ctx);
+	bool (*support_attributes)(const char *attr_name, enum cxl_attrs type);
+	bool (*link_ok)(struct cxl *cxl, struct cxl_afu *afu);
+	void (*release_afu)(struct device *dev);
+	ssize_t (*afu_read_err_buffer)(struct cxl_afu *afu, char *buf,
+				loff_t off, size_t count);
+	int (*afu_check_and_enable)(struct cxl_afu *afu);
+	int (*afu_activate_mode)(struct cxl_afu *afu, int mode);
+	int (*afu_deactivate_mode)(struct cxl_afu *afu, int mode);
+	int (*afu_reset)(struct cxl_afu *afu);
+	int (*afu_cr_read8)(struct cxl_afu *afu, int cr_idx, u64 offset, u8 *val);
+	int (*afu_cr_read16)(struct cxl_afu *afu, int cr_idx, u64 offset, u16 *val);
+	int (*afu_cr_read32)(struct cxl_afu *afu, int cr_idx, u64 offset, u32 *val);
+	int (*afu_cr_read64)(struct cxl_afu *afu, int cr_idx, u64 offset, u64 *val);
+	int (*afu_cr_write8)(struct cxl_afu *afu, int cr_idx, u64 offset, u8 val);
+	int (*afu_cr_write16)(struct cxl_afu *afu, int cr_idx, u64 offset, u16 val);
+	int (*afu_cr_write32)(struct cxl_afu *afu, int cr_idx, u64 offset, u32 val);
+	ssize_t (*read_adapter_vpd)(struct cxl *adapter, void *buf, size_t count);
+};
+extern const struct cxl_backend_ops cxl_native_ops;
+extern const struct cxl_backend_ops cxl_guest_ops;
+extern const struct cxl_backend_ops *cxl_ops;
+
+/* check if the given pci_dev is on the the cxl vphb bus */
+bool cxl_pci_is_vphb_device(struct pci_dev *dev);
 #endif
diff --git a/drivers/misc/cxl/debugfs.c b/drivers/misc/cxl/debugfs.c
index 18df6f44af2a..5751899e0c17 100644
--- a/drivers/misc/cxl/debugfs.c
+++ b/drivers/misc/cxl/debugfs.c
@@ -118,6 +118,10 @@ void cxl_debugfs_afu_remove(struct cxl_afu *afu)
 int __init cxl_debugfs_init(void)
 {
 	struct dentry *ent;
+
+	if (!cpu_has_feature(CPU_FTR_HVMODE))
+		return 0;
+
 	ent = debugfs_create_dir("cxl", NULL);
 	if (IS_ERR(ent))
 		return PTR_ERR(ent);
diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c
index 81c3f75b7330..9a8650bcb042 100644
--- a/drivers/misc/cxl/fault.c
+++ b/drivers/misc/cxl/fault.c
@@ -101,7 +101,7 @@ static void cxl_ack_ae(struct cxl_context *ctx)
 {
 	unsigned long flags;
 
-	cxl_ack_irq(ctx, CXL_PSL_TFC_An_AE, 0);
+	cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_AE, 0);
 
 	spin_lock_irqsave(&ctx->lock, flags);
 	ctx->pending_fault = true;
@@ -125,7 +125,7 @@ static int cxl_handle_segment_miss(struct cxl_context *ctx,
 	else {
 
 		mb(); /* Order seg table write to TFC MMIO write */
-		cxl_ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
+		cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
 	}
 
 	return IRQ_HANDLED;
@@ -163,7 +163,7 @@ static void cxl_handle_page_fault(struct cxl_context *ctx,
 	local_irq_restore(flags);
 
 	pr_devel("Page fault successfully handled for pe: %i!\n", ctx->pe);
-	cxl_ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
+	cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_R, 0);
 }
 
 /*
@@ -254,14 +254,17 @@ void cxl_handle_fault(struct work_struct *fault_work)
 	u64 dar = ctx->dar;
 	struct mm_struct *mm = NULL;
 
-	if (cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An) != dsisr ||
-	    cxl_p2n_read(ctx->afu, CXL_PSL_DAR_An) != dar ||
-	    cxl_p2n_read(ctx->afu, CXL_PSL_PEHandle_An) != ctx->pe) {
-		/* Most likely explanation is harmless - a dedicated process
-		 * has detached and these were cleared by the PSL purge, but
-		 * warn about it just in case */
-		dev_notice(&ctx->afu->dev, "cxl_handle_fault: Translation fault regs changed\n");
-		return;
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (cxl_p2n_read(ctx->afu, CXL_PSL_DSISR_An) != dsisr ||
+		    cxl_p2n_read(ctx->afu, CXL_PSL_DAR_An) != dar ||
+		    cxl_p2n_read(ctx->afu, CXL_PSL_PEHandle_An) != ctx->pe) {
+			/* Most likely explanation is harmless - a dedicated
+			 * process has detached and these were cleared by the
+			 * PSL purge, but warn about it just in case
+			 */
+			dev_notice(&ctx->afu->dev, "cxl_handle_fault: Translation fault regs changed\n");
+			return;
+		}
 	}
 
 	/* Early return if the context is being / has been detached */
diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c
index 783337d22f36..eec468f1612f 100644
--- a/drivers/misc/cxl/file.c
+++ b/drivers/misc/cxl/file.c
@@ -26,9 +26,7 @@
 #include "trace.h"
 
 #define CXL_NUM_MINORS 256 /* Total to reserve */
-#define CXL_DEV_MINORS 13   /* 1 control + 4 AFUs * 3 (dedicated/master/shared) */
 
-#define CXL_CARD_MINOR(adapter) (adapter->adapter_num * CXL_DEV_MINORS)
 #define CXL_AFU_MINOR_D(afu) (CXL_CARD_MINOR(afu->adapter) + 1 + (3 * afu->slice))
 #define CXL_AFU_MINOR_M(afu) (CXL_AFU_MINOR_D(afu) + 1)
 #define CXL_AFU_MINOR_S(afu) (CXL_AFU_MINOR_D(afu) + 2)
@@ -36,7 +34,6 @@
 #define CXL_AFU_MKDEV_M(afu) MKDEV(MAJOR(cxl_dev), CXL_AFU_MINOR_M(afu))
 #define CXL_AFU_MKDEV_S(afu) MKDEV(MAJOR(cxl_dev), CXL_AFU_MINOR_S(afu))
 
-#define CXL_DEVT_ADAPTER(dev) (MINOR(dev) / CXL_DEV_MINORS)
 #define CXL_DEVT_AFU(dev) ((MINOR(dev) % CXL_DEV_MINORS - 1) / 3)
 
 #define CXL_DEVT_IS_CARD(dev) (MINOR(dev) % CXL_DEV_MINORS == 0)
@@ -79,7 +76,7 @@ static int __afu_open(struct inode *inode, struct file *file, bool master)
 	if (!afu->current_mode)
 		goto err_put_afu;
 
-	if (!cxl_adapter_link_ok(adapter)) {
+	if (!cxl_ops->link_ok(adapter, afu)) {
 		rc = -EIO;
 		goto err_put_afu;
 	}
@@ -210,8 +207,8 @@ static long afu_ioctl_start_work(struct cxl_context *ctx,
 
 	trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr);
 
-	if ((rc = cxl_attach_process(ctx, false, work.work_element_descriptor,
-				     amr))) {
+	if ((rc = cxl_ops->attach_process(ctx, false, work.work_element_descriptor,
+							amr))) {
 		afu_release_irqs(ctx, ctx);
 		goto out;
 	}
@@ -222,12 +219,13 @@ out:
 	mutex_unlock(&ctx->status_mutex);
 	return rc;
 }
+
 static long afu_ioctl_process_element(struct cxl_context *ctx,
 				      int __user *upe)
 {
 	pr_devel("%s: pe: %i\n", __func__, ctx->pe);
 
-	if (copy_to_user(upe, &ctx->pe, sizeof(__u32)))
+	if (copy_to_user(upe, &ctx->external_pe, sizeof(__u32)))
 		return -EFAULT;
 
 	return 0;
@@ -259,7 +257,7 @@ long afu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	if (ctx->status == CLOSED)
 		return -EIO;
 
-	if (!cxl_adapter_link_ok(ctx->afu->adapter))
+	if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
 		return -EIO;
 
 	pr_devel("afu_ioctl\n");
@@ -289,7 +287,7 @@ int afu_mmap(struct file *file, struct vm_area_struct *vm)
 	if (ctx->status != STARTED)
 		return -EIO;
 
-	if (!cxl_adapter_link_ok(ctx->afu->adapter))
+	if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
 		return -EIO;
 
 	return cxl_context_iomap(ctx, vm);
@@ -336,7 +334,7 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count,
 	int rc;
 	DEFINE_WAIT(wait);
 
-	if (!cxl_adapter_link_ok(ctx->afu->adapter))
+	if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
 		return -EIO;
 
 	if (count < CXL_READ_MIN_SIZE)
@@ -349,7 +347,7 @@ ssize_t afu_read(struct file *file, char __user *buf, size_t count,
 		if (ctx_event_pending(ctx))
 			break;
 
-		if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
+		if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) {
 			rc = -EIO;
 			goto out;
 		}
@@ -445,7 +443,8 @@ static const struct file_operations afu_master_fops = {
 
 static char *cxl_devnode(struct device *dev, umode_t *mode)
 {
-	if (CXL_DEVT_IS_CARD(dev->devt)) {
+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
+	    CXL_DEVT_IS_CARD(dev->devt)) {
 		/*
 		 * These minor numbers will eventually be used to program the
 		 * PSL and AFUs once we have dynamic reprogramming support
@@ -546,6 +545,11 @@ int cxl_register_adapter(struct cxl *adapter)
 	return device_register(&adapter->dev);
 }
 
+dev_t cxl_get_dev(void)
+{
+	return cxl_dev;
+}
+
 int __init cxl_file_init(void)
 {
 	int rc;
diff --git a/drivers/misc/cxl/flash.c b/drivers/misc/cxl/flash.c
new file mode 100644
index 000000000000..68dd0b7da471
--- /dev/null
+++ b/drivers/misc/cxl/flash.c
@@ -0,0 +1,538 @@
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/rtas.h>
+
+#include "cxl.h"
+#include "hcalls.h"
+
+#define DOWNLOAD_IMAGE 1
+#define VALIDATE_IMAGE 2
+
+struct ai_header {
+	u16 version;
+	u8  reserved0[6];
+	u16 vendor;
+	u16 device;
+	u16 subsystem_vendor;
+	u16 subsystem;
+	u64 image_offset;
+	u64 image_length;
+	u8  reserved1[96];
+};
+
+static struct semaphore sem;
+unsigned long *buffer[CXL_AI_MAX_ENTRIES];
+struct sg_list *le;
+static u64 continue_token;
+static unsigned int transfer;
+
+struct update_props_workarea {
+	__be32 phandle;
+	__be32 state;
+	__be64 reserved;
+	__be32 nprops;
+} __packed;
+
+struct update_nodes_workarea {
+	__be32 state;
+	__be64 unit_address;
+	__be32 reserved;
+} __packed;
+
+#define DEVICE_SCOPE 3
+#define NODE_ACTION_MASK	0xff000000
+#define NODE_COUNT_MASK		0x00ffffff
+#define OPCODE_DELETE	0x01000000
+#define OPCODE_UPDATE	0x02000000
+#define OPCODE_ADD	0x03000000
+
+static int rcall(int token, char *buf, s32 scope)
+{
+	int rc;
+
+	spin_lock(&rtas_data_buf_lock);
+
+	memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE);
+	rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, scope);
+	memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE);
+
+	spin_unlock(&rtas_data_buf_lock);
+	return rc;
+}
+
+static int update_property(struct device_node *dn, const char *name,
+			   u32 vd, char *value)
+{
+	struct property *new_prop;
+	u32 *val;
+	int rc;
+
+	new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL);
+	if (!new_prop)
+		return -ENOMEM;
+
+	new_prop->name = kstrdup(name, GFP_KERNEL);
+	if (!new_prop->name) {
+		kfree(new_prop);
+		return -ENOMEM;
+	}
+
+	new_prop->length = vd;
+	new_prop->value = kzalloc(new_prop->length, GFP_KERNEL);
+	if (!new_prop->value) {
+		kfree(new_prop->name);
+		kfree(new_prop);
+		return -ENOMEM;
+	}
+	memcpy(new_prop->value, value, vd);
+
+	val = (u32 *)new_prop->value;
+	rc = cxl_update_properties(dn, new_prop);
+	pr_devel("%s: update property (%s, length: %i, value: %#x)\n",
+		  dn->name, name, vd, be32_to_cpu(*val));
+
+	if (rc) {
+		kfree(new_prop->name);
+		kfree(new_prop->value);
+		kfree(new_prop);
+	}
+	return rc;
+}
+
+static int update_node(__be32 phandle, s32 scope)
+{
+	struct update_props_workarea *upwa;
+	struct device_node *dn;
+	int i, rc, ret;
+	char *prop_data;
+	char *buf;
+	int token;
+	u32 nprops;
+	u32 vd;
+
+	token = rtas_token("ibm,update-properties");
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	dn = of_find_node_by_phandle(be32_to_cpu(phandle));
+	if (!dn) {
+		kfree(buf);
+		return -ENOENT;
+	}
+
+	upwa = (struct update_props_workarea *)&buf[0];
+	upwa->phandle = phandle;
+	do {
+		rc = rcall(token, buf, scope);
+		if (rc < 0)
+			break;
+
+		prop_data = buf + sizeof(*upwa);
+		nprops = be32_to_cpu(upwa->nprops);
+
+		if (*prop_data == 0) {
+			prop_data++;
+			vd = be32_to_cpu(*(__be32 *)prop_data);
+			prop_data += vd + sizeof(vd);
+			nprops--;
+		}
+
+		for (i = 0; i < nprops; i++) {
+			char *prop_name;
+
+			prop_name = prop_data;
+			prop_data += strlen(prop_name) + 1;
+			vd = be32_to_cpu(*(__be32 *)prop_data);
+			prop_data += sizeof(vd);
+
+			if ((vd != 0x00000000) && (vd != 0x80000000)) {
+				ret = update_property(dn, prop_name, vd,
+						prop_data);
+				if (ret)
+					pr_err("cxl: Could not update property %s - %i\n",
+					       prop_name, ret);
+
+				prop_data += vd;
+			}
+		}
+	} while (rc == 1);
+
+	of_node_put(dn);
+	kfree(buf);
+	return rc;
+}
+
+static int update_devicetree(struct cxl *adapter, s32 scope)
+{
+	struct update_nodes_workarea *unwa;
+	u32 action, node_count;
+	int token, rc, i;
+	__be32 *data, drc_index, phandle;
+	char *buf;
+
+	token = rtas_token("ibm,update-nodes");
+	if (token == RTAS_UNKNOWN_SERVICE)
+		return -EINVAL;
+
+	buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	unwa = (struct update_nodes_workarea *)&buf[0];
+	unwa->unit_address = cpu_to_be64(adapter->guest->handle);
+	do {
+		rc = rcall(token, buf, scope);
+		if (rc && rc != 1)
+			break;
+
+		data = (__be32 *)buf + 4;
+		while (be32_to_cpu(*data) & NODE_ACTION_MASK) {
+			action = be32_to_cpu(*data) & NODE_ACTION_MASK;
+			node_count = be32_to_cpu(*data) & NODE_COUNT_MASK;
+			pr_devel("device reconfiguration - action: %#x, nodes: %#x\n",
+				 action, node_count);
+			data++;
+
+			for (i = 0; i < node_count; i++) {
+				phandle = *data++;
+
+				switch (action) {
+				case OPCODE_DELETE:
+					/* nothing to do */
+					break;
+				case OPCODE_UPDATE:
+					update_node(phandle, scope);
+					break;
+				case OPCODE_ADD:
+					/* nothing to do, just move pointer */
+					drc_index = *data++;
+					break;
+				}
+			}
+		}
+	} while (rc == 1);
+
+	kfree(buf);
+	return 0;
+}
+
+static int handle_image(struct cxl *adapter, int operation,
+			long (*fct)(u64, u64, u64, u64 *),
+			struct cxl_adapter_image *ai)
+{
+	size_t mod, s_copy, len_chunk = 0;
+	struct ai_header *header = NULL;
+	unsigned int entries = 0, i;
+	void *dest, *from;
+	int rc = 0, need_header;
+
+	/* base adapter image header */
+	need_header = (ai->flags & CXL_AI_NEED_HEADER);
+	if (need_header) {
+		header = kzalloc(sizeof(struct ai_header), GFP_KERNEL);
+		if (!header)
+			return -ENOMEM;
+		header->version = cpu_to_be16(1);
+		header->vendor = cpu_to_be16(adapter->guest->vendor);
+		header->device = cpu_to_be16(adapter->guest->device);
+		header->subsystem_vendor = cpu_to_be16(adapter->guest->subsystem_vendor);
+		header->subsystem = cpu_to_be16(adapter->guest->subsystem);
+		header->image_offset = cpu_to_be64(CXL_AI_HEADER_SIZE);
+		header->image_length = cpu_to_be64(ai->len_image);
+	}
+
+	/* number of entries in the list */
+	len_chunk = ai->len_data;
+	if (need_header)
+		len_chunk += CXL_AI_HEADER_SIZE;
+
+	entries = len_chunk / CXL_AI_BUFFER_SIZE;
+	mod = len_chunk % CXL_AI_BUFFER_SIZE;
+	if (mod)
+		entries++;
+
+	if (entries > CXL_AI_MAX_ENTRIES) {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	/*          < -- MAX_CHUNK_SIZE = 4096 * 256 = 1048576 bytes -->
+	 * chunk 0  ----------------------------------------------------
+	 *          | header   |  data                                 |
+	 *          ----------------------------------------------------
+	 * chunk 1  ----------------------------------------------------
+	 *          | data                                             |
+	 *          ----------------------------------------------------
+	 * ....
+	 * chunk n  ----------------------------------------------------
+	 *          | data                                             |
+	 *          ----------------------------------------------------
+	 */
+	from = (void *) ai->data;
+	for (i = 0; i < entries; i++) {
+		dest = buffer[i];
+		s_copy = CXL_AI_BUFFER_SIZE;
+
+		if ((need_header) && (i == 0)) {
+			/* add adapter image header */
+			memcpy(buffer[i], header, sizeof(struct ai_header));
+			s_copy = CXL_AI_BUFFER_SIZE - CXL_AI_HEADER_SIZE;
+			dest += CXL_AI_HEADER_SIZE; /* image offset */
+		}
+		if ((i == (entries - 1)) && mod)
+			s_copy = mod;
+
+		/* copy data */
+		if (copy_from_user(dest, from, s_copy))
+			goto err;
+
+		/* fill in the list */
+		le[i].phys_addr = cpu_to_be64(virt_to_phys(buffer[i]));
+		le[i].len = cpu_to_be64(CXL_AI_BUFFER_SIZE);
+		if ((i == (entries - 1)) && mod)
+			le[i].len = cpu_to_be64(mod);
+		from += s_copy;
+	}
+	pr_devel("%s (op: %i, need header: %i, entries: %i, token: %#llx)\n",
+		 __func__, operation, need_header, entries, continue_token);
+
+	/*
+	 * download/validate the adapter image to the coherent
+	 * platform facility
+	 */
+	rc = fct(adapter->guest->handle, virt_to_phys(le), entries,
+		&continue_token);
+	if (rc == 0) /* success of download/validation operation */
+		continue_token = 0;
+
+err:
+	kfree(header);
+
+	return rc;
+}
+
+static int transfer_image(struct cxl *adapter, int operation,
+			struct cxl_adapter_image *ai)
+{
+	int rc = 0;
+	int afu;
+
+	switch (operation) {
+	case DOWNLOAD_IMAGE:
+		rc = handle_image(adapter, operation,
+				&cxl_h_download_adapter_image, ai);
+		if (rc < 0) {
+			pr_devel("resetting adapter\n");
+			cxl_h_reset_adapter(adapter->guest->handle);
+		}
+		return rc;
+
+	case VALIDATE_IMAGE:
+		rc = handle_image(adapter, operation,
+				&cxl_h_validate_adapter_image, ai);
+		if (rc < 0) {
+			pr_devel("resetting adapter\n");
+			cxl_h_reset_adapter(adapter->guest->handle);
+			return rc;
+		}
+		if (rc == 0) {
+			pr_devel("remove curent afu\n");
+			for (afu = 0; afu < adapter->slices; afu++)
+				cxl_guest_remove_afu(adapter->afu[afu]);
+
+			pr_devel("resetting adapter\n");
+			cxl_h_reset_adapter(adapter->guest->handle);
+
+			/* The entire image has now been
+			 * downloaded and the validation has
+			 * been successfully performed.
+			 * After that, the partition should call
+			 * ibm,update-nodes and
+			 * ibm,update-properties to receive the
+			 * current configuration
+			 */
+			rc = update_devicetree(adapter, DEVICE_SCOPE);
+			transfer = 1;
+		}
+		return rc;
+	}
+
+	return -EINVAL;
+}
+
+static long ioctl_transfer_image(struct cxl *adapter, int operation,
+				struct cxl_adapter_image __user *uai)
+{
+	struct cxl_adapter_image ai;
+
+	pr_devel("%s\n", __func__);
+
+	if (copy_from_user(&ai, uai, sizeof(struct cxl_adapter_image)))
+		return -EFAULT;
+
+	/*
+	 * Make sure reserved fields and bits are set to 0
+	 */
+	if (ai.reserved1 || ai.reserved2 || ai.reserved3 || ai.reserved4 ||
+		(ai.flags & ~CXL_AI_ALL))
+		return -EINVAL;
+
+	return transfer_image(adapter, operation, &ai);
+}
+
+static int device_open(struct inode *inode, struct file *file)
+{
+	int adapter_num = CXL_DEVT_ADAPTER(inode->i_rdev);
+	struct cxl *adapter;
+	int rc = 0, i;
+
+	pr_devel("in %s\n", __func__);
+
+	BUG_ON(sizeof(struct ai_header) != CXL_AI_HEADER_SIZE);
+
+	/* Allows one process to open the device by using a semaphore */
+	if (down_interruptible(&sem) != 0)
+		return -EPERM;
+
+	if (!(adapter = get_cxl_adapter(adapter_num)))
+		return -ENODEV;
+
+	file->private_data = adapter;
+	continue_token = 0;
+	transfer = 0;
+
+	for (i = 0; i < CXL_AI_MAX_ENTRIES; i++)
+		buffer[i] = NULL;
+
+	/* aligned buffer containing list entries which describes up to
+	 * 1 megabyte of data (256 entries of 4096 bytes each)
+	 *  Logical real address of buffer 0  -  Buffer 0 length in bytes
+	 *  Logical real address of buffer 1  -  Buffer 1 length in bytes
+	 *  Logical real address of buffer 2  -  Buffer 2 length in bytes
+	 *  ....
+	 *  ....
+	 *  Logical real address of buffer N  -  Buffer N length in bytes
+	 */
+	le = (struct sg_list *)get_zeroed_page(GFP_KERNEL);
+	if (!le) {
+		rc = -ENOMEM;
+		goto err;
+	}
+
+	for (i = 0; i < CXL_AI_MAX_ENTRIES; i++) {
+		buffer[i] = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+		if (!buffer[i]) {
+			rc = -ENOMEM;
+			goto err1;
+		}
+	}
+
+	return 0;
+
+err1:
+	for (i = 0; i < CXL_AI_MAX_ENTRIES; i++) {
+		if (buffer[i])
+			free_page((unsigned long) buffer[i]);
+	}
+
+	if (le)
+		free_page((unsigned long) le);
+err:
+	put_device(&adapter->dev);
+
+	return rc;
+}
+
+static long device_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct cxl *adapter = file->private_data;
+
+	pr_devel("in %s\n", __func__);
+
+	if (cmd == CXL_IOCTL_DOWNLOAD_IMAGE)
+		return ioctl_transfer_image(adapter,
+					DOWNLOAD_IMAGE,
+					(struct cxl_adapter_image __user *)arg);
+	else if (cmd == CXL_IOCTL_VALIDATE_IMAGE)
+		return ioctl_transfer_image(adapter,
+					VALIDATE_IMAGE,
+					(struct cxl_adapter_image __user *)arg);
+	else
+		return -EINVAL;
+}
+
+static long device_compat_ioctl(struct file *file, unsigned int cmd,
+				unsigned long arg)
+{
+	return device_ioctl(file, cmd, arg);
+}
+
+static int device_close(struct inode *inode, struct file *file)
+{
+	struct cxl *adapter = file->private_data;
+	int i;
+
+	pr_devel("in %s\n", __func__);
+
+	for (i = 0; i < CXL_AI_MAX_ENTRIES; i++) {
+		if (buffer[i])
+			free_page((unsigned long) buffer[i]);
+	}
+
+	if (le)
+		free_page((unsigned long) le);
+
+	up(&sem);
+	put_device(&adapter->dev);
+	continue_token = 0;
+
+	/* reload the module */
+	if (transfer)
+		cxl_guest_reload_module(adapter);
+	else {
+		pr_devel("resetting adapter\n");
+		cxl_h_reset_adapter(adapter->guest->handle);
+	}
+
+	transfer = 0;
+	return 0;
+}
+
+static const struct file_operations fops = {
+	.owner		= THIS_MODULE,
+	.open		= device_open,
+	.unlocked_ioctl	= device_ioctl,
+	.compat_ioctl	= device_compat_ioctl,
+	.release	= device_close,
+};
+
+void cxl_guest_remove_chardev(struct cxl *adapter)
+{
+	cdev_del(&adapter->guest->cdev);
+}
+
+int cxl_guest_add_chardev(struct cxl *adapter)
+{
+	dev_t devt;
+	int rc;
+
+	devt = MKDEV(MAJOR(cxl_get_dev()), CXL_CARD_MINOR(adapter));
+	cdev_init(&adapter->guest->cdev, &fops);
+	if ((rc = cdev_add(&adapter->guest->cdev, devt, 1))) {
+		dev_err(&adapter->dev,
+			"Unable to add chardev on adapter (card%i): %i\n",
+			adapter->adapter_num, rc);
+		goto err;
+	}
+	adapter->dev.devt = devt;
+	sema_init(&sem, 1);
+err:
+	return rc;
+}
diff --git a/drivers/misc/cxl/guest.c b/drivers/misc/cxl/guest.c
new file mode 100644
index 000000000000..8213372de2b7
--- /dev/null
+++ b/drivers/misc/cxl/guest.c
@@ -0,0 +1,1177 @@
+/*
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/uaccess.h>
+#include <linux/delay.h>
+
+#include "cxl.h"
+#include "hcalls.h"
+#include "trace.h"
+
+#define CXL_ERROR_DETECTED_EVENT	1
+#define CXL_SLOT_RESET_EVENT		2
+#define CXL_RESUME_EVENT		3
+
+static void pci_error_handlers(struct cxl_afu *afu,
+				int bus_error_event,
+				pci_channel_state_t state)
+{
+	struct pci_dev *afu_dev;
+
+	if (afu->phb == NULL)
+		return;
+
+	list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
+		if (!afu_dev->driver)
+			continue;
+
+		switch (bus_error_event) {
+		case CXL_ERROR_DETECTED_EVENT:
+			afu_dev->error_state = state;
+
+			if (afu_dev->driver->err_handler &&
+			    afu_dev->driver->err_handler->error_detected)
+				afu_dev->driver->err_handler->error_detected(afu_dev, state);
+		break;
+		case CXL_SLOT_RESET_EVENT:
+			afu_dev->error_state = state;
+
+			if (afu_dev->driver->err_handler &&
+			    afu_dev->driver->err_handler->slot_reset)
+				afu_dev->driver->err_handler->slot_reset(afu_dev);
+		break;
+		case CXL_RESUME_EVENT:
+			if (afu_dev->driver->err_handler &&
+			    afu_dev->driver->err_handler->resume)
+				afu_dev->driver->err_handler->resume(afu_dev);
+		break;
+		}
+	}
+}
+
+static irqreturn_t guest_handle_psl_slice_error(struct cxl_context *ctx, u64 dsisr,
+					u64 errstat)
+{
+	pr_devel("in %s\n", __func__);
+	dev_crit(&ctx->afu->dev, "PSL ERROR STATUS: 0x%.16llx\n", errstat);
+
+	return cxl_ops->ack_irq(ctx, 0, errstat);
+}
+
+static ssize_t guest_collect_vpd(struct cxl *adapter, struct cxl_afu *afu,
+			void *buf, size_t len)
+{
+	unsigned int entries, mod;
+	unsigned long **vpd_buf = NULL;
+	struct sg_list *le;
+	int rc = 0, i, tocopy;
+	u64 out = 0;
+
+	if (buf == NULL)
+		return -EINVAL;
+
+	/* number of entries in the list */
+	entries = len / SG_BUFFER_SIZE;
+	mod = len % SG_BUFFER_SIZE;
+	if (mod)
+		entries++;
+
+	if (entries > SG_MAX_ENTRIES) {
+		entries = SG_MAX_ENTRIES;
+		len = SG_MAX_ENTRIES * SG_BUFFER_SIZE;
+		mod = 0;
+	}
+
+	vpd_buf = kzalloc(entries * sizeof(unsigned long *), GFP_KERNEL);
+	if (!vpd_buf)
+		return -ENOMEM;
+
+	le = (struct sg_list *)get_zeroed_page(GFP_KERNEL);
+	if (!le) {
+		rc = -ENOMEM;
+		goto err1;
+	}
+
+	for (i = 0; i < entries; i++) {
+		vpd_buf[i] = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+		if (!vpd_buf[i]) {
+			rc = -ENOMEM;
+			goto err2;
+		}
+		le[i].phys_addr = cpu_to_be64(virt_to_phys(vpd_buf[i]));
+		le[i].len = cpu_to_be64(SG_BUFFER_SIZE);
+		if ((i == (entries - 1)) && mod)
+			le[i].len = cpu_to_be64(mod);
+	}
+
+	if (adapter)
+		rc = cxl_h_collect_vpd_adapter(adapter->guest->handle,
+					virt_to_phys(le), entries, &out);
+	else
+		rc = cxl_h_collect_vpd(afu->guest->handle, 0,
+				virt_to_phys(le), entries, &out);
+	pr_devel("length of available (entries: %i), vpd: %#llx\n",
+		entries, out);
+
+	if (!rc) {
+		/*
+		 * hcall returns in 'out' the size of available VPDs.
+		 * It fills the buffer with as much data as possible.
+		 */
+		if (out < len)
+			len = out;
+		rc = len;
+		if (out) {
+			for (i = 0; i < entries; i++) {
+				if (len < SG_BUFFER_SIZE)
+					tocopy = len;
+				else
+					tocopy = SG_BUFFER_SIZE;
+				memcpy(buf, vpd_buf[i], tocopy);
+				buf += tocopy;
+				len -= tocopy;
+			}
+		}
+	}
+err2:
+	for (i = 0; i < entries; i++) {
+		if (vpd_buf[i])
+			free_page((unsigned long) vpd_buf[i]);
+	}
+	free_page((unsigned long) le);
+err1:
+	kfree(vpd_buf);
+	return rc;
+}
+
+static int guest_get_irq_info(struct cxl_context *ctx, struct cxl_irq_info *info)
+{
+	return cxl_h_collect_int_info(ctx->afu->guest->handle, ctx->process_token, info);
+}
+
+static irqreturn_t guest_psl_irq(int irq, void *data)
+{
+	struct cxl_context *ctx = data;
+	struct cxl_irq_info irq_info;
+	int rc;
+
+	pr_devel("%d: received PSL interrupt %i\n", ctx->pe, irq);
+	rc = guest_get_irq_info(ctx, &irq_info);
+	if (rc) {
+		WARN(1, "Unable to get IRQ info: %i\n", rc);
+		return IRQ_HANDLED;
+	}
+
+	rc = cxl_irq(irq, ctx, &irq_info);
+	return rc;
+}
+
+static int afu_read_error_state(struct cxl_afu *afu, int *state_out)
+{
+	u64 state;
+	int rc = 0;
+
+	rc = cxl_h_read_error_state(afu->guest->handle, &state);
+	if (!rc) {
+		WARN_ON(state != H_STATE_NORMAL &&
+			state != H_STATE_DISABLE &&
+			state != H_STATE_TEMP_UNAVAILABLE &&
+			state != H_STATE_PERM_UNAVAILABLE);
+		*state_out = state & 0xffffffff;
+	}
+	return rc;
+}
+
+static irqreturn_t guest_slice_irq_err(int irq, void *data)
+{
+	struct cxl_afu *afu = data;
+	int rc;
+	u64 serr;
+
+	WARN(irq, "CXL SLICE ERROR interrupt %i\n", irq);
+	rc = cxl_h_get_fn_error_interrupt(afu->guest->handle, &serr);
+	if (rc) {
+		dev_crit(&afu->dev, "Couldn't read PSL_SERR_An: %d\n", rc);
+		return IRQ_HANDLED;
+	}
+	dev_crit(&afu->dev, "PSL_SERR_An: 0x%.16llx\n", serr);
+
+	rc = cxl_h_ack_fn_error_interrupt(afu->guest->handle, serr);
+	if (rc)
+		dev_crit(&afu->dev, "Couldn't ack slice error interrupt: %d\n",
+			rc);
+
+	return IRQ_HANDLED;
+}
+
+
+static int irq_alloc_range(struct cxl *adapter, int len, int *irq)
+{
+	int i, n;
+	struct irq_avail *cur;
+
+	for (i = 0; i < adapter->guest->irq_nranges; i++) {
+		cur = &adapter->guest->irq_avail[i];
+		n = bitmap_find_next_zero_area(cur->bitmap, cur->range,
+					0, len, 0);
+		if (n < cur->range) {
+			bitmap_set(cur->bitmap, n, len);
+			*irq = cur->offset + n;
+			pr_devel("guest: allocate IRQs %#x->%#x\n",
+				*irq, *irq + len - 1);
+
+			return 0;
+		}
+	}
+	return -ENOSPC;
+}
+
+static int irq_free_range(struct cxl *adapter, int irq, int len)
+{
+	int i, n;
+	struct irq_avail *cur;
+
+	if (len == 0)
+		return -ENOENT;
+
+	for (i = 0; i < adapter->guest->irq_nranges; i++) {
+		cur = &adapter->guest->irq_avail[i];
+		if (irq >= cur->offset &&
+			(irq + len) <= (cur->offset + cur->range)) {
+			n = irq - cur->offset;
+			bitmap_clear(cur->bitmap, n, len);
+			pr_devel("guest: release IRQs %#x->%#x\n",
+				irq, irq + len - 1);
+			return 0;
+		}
+	}
+	return -ENOENT;
+}
+
+static int guest_reset(struct cxl *adapter)
+{
+	struct cxl_afu *afu = NULL;
+	int i, rc;
+
+	pr_devel("Adapter reset request\n");
+	for (i = 0; i < adapter->slices; i++) {
+		if ((afu = adapter->afu[i])) {
+			pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
+					pci_channel_io_frozen);
+			cxl_context_detach_all(afu);
+		}
+	}
+
+	rc = cxl_h_reset_adapter(adapter->guest->handle);
+	for (i = 0; i < adapter->slices; i++) {
+		if (!rc && (afu = adapter->afu[i])) {
+			pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
+					pci_channel_io_normal);
+			pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
+		}
+	}
+	return rc;
+}
+
+static int guest_alloc_one_irq(struct cxl *adapter)
+{
+	int irq;
+
+	spin_lock(&adapter->guest->irq_alloc_lock);
+	if (irq_alloc_range(adapter, 1, &irq))
+		irq = -ENOSPC;
+	spin_unlock(&adapter->guest->irq_alloc_lock);
+	return irq;
+}
+
+static void guest_release_one_irq(struct cxl *adapter, int irq)
+{
+	spin_lock(&adapter->guest->irq_alloc_lock);
+	irq_free_range(adapter, irq, 1);
+	spin_unlock(&adapter->guest->irq_alloc_lock);
+}
+
+static int guest_alloc_irq_ranges(struct cxl_irq_ranges *irqs,
+				struct cxl *adapter, unsigned int num)
+{
+	int i, try, irq;
+
+	memset(irqs, 0, sizeof(struct cxl_irq_ranges));
+
+	spin_lock(&adapter->guest->irq_alloc_lock);
+	for (i = 0; i < CXL_IRQ_RANGES && num; i++) {
+		try = num;
+		while (try) {
+			if (irq_alloc_range(adapter, try, &irq) == 0)
+				break;
+			try /= 2;
+		}
+		if (!try)
+			goto error;
+		irqs->offset[i] = irq;
+		irqs->range[i] = try;
+		num -= try;
+	}
+	if (num)
+		goto error;
+	spin_unlock(&adapter->guest->irq_alloc_lock);
+	return 0;
+
+error:
+	for (i = 0; i < CXL_IRQ_RANGES; i++)
+		irq_free_range(adapter, irqs->offset[i], irqs->range[i]);
+	spin_unlock(&adapter->guest->irq_alloc_lock);
+	return -ENOSPC;
+}
+
+static void guest_release_irq_ranges(struct cxl_irq_ranges *irqs,
+				struct cxl *adapter)
+{
+	int i;
+
+	spin_lock(&adapter->guest->irq_alloc_lock);
+	for (i = 0; i < CXL_IRQ_RANGES; i++)
+		irq_free_range(adapter, irqs->offset[i], irqs->range[i]);
+	spin_unlock(&adapter->guest->irq_alloc_lock);
+}
+
+static int guest_register_serr_irq(struct cxl_afu *afu)
+{
+	afu->err_irq_name = kasprintf(GFP_KERNEL, "cxl-%s-err",
+				      dev_name(&afu->dev));
+	if (!afu->err_irq_name)
+		return -ENOMEM;
+
+	if (!(afu->serr_virq = cxl_map_irq(afu->adapter, afu->serr_hwirq,
+				 guest_slice_irq_err, afu, afu->err_irq_name))) {
+		kfree(afu->err_irq_name);
+		afu->err_irq_name = NULL;
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void guest_release_serr_irq(struct cxl_afu *afu)
+{
+	cxl_unmap_irq(afu->serr_virq, afu);
+	cxl_ops->release_one_irq(afu->adapter, afu->serr_hwirq);
+	kfree(afu->err_irq_name);
+}
+
+static int guest_ack_irq(struct cxl_context *ctx, u64 tfc, u64 psl_reset_mask)
+{
+	return cxl_h_control_faults(ctx->afu->guest->handle, ctx->process_token,
+				tfc >> 32, (psl_reset_mask != 0));
+}
+
+static void disable_afu_irqs(struct cxl_context *ctx)
+{
+	irq_hw_number_t hwirq;
+	unsigned int virq;
+	int r, i;
+
+	pr_devel("Disabling AFU(%d) interrupts\n", ctx->afu->slice);
+	for (r = 0; r < CXL_IRQ_RANGES; r++) {
+		hwirq = ctx->irqs.offset[r];
+		for (i = 0; i < ctx->irqs.range[r]; hwirq++, i++) {
+			virq = irq_find_mapping(NULL, hwirq);
+			disable_irq(virq);
+		}
+	}
+}
+
+static void enable_afu_irqs(struct cxl_context *ctx)
+{
+	irq_hw_number_t hwirq;
+	unsigned int virq;
+	int r, i;
+
+	pr_devel("Enabling AFU(%d) interrupts\n", ctx->afu->slice);
+	for (r = 0; r < CXL_IRQ_RANGES; r++) {
+		hwirq = ctx->irqs.offset[r];
+		for (i = 0; i < ctx->irqs.range[r]; hwirq++, i++) {
+			virq = irq_find_mapping(NULL, hwirq);
+			enable_irq(virq);
+		}
+	}
+}
+
+static int _guest_afu_cr_readXX(int sz, struct cxl_afu *afu, int cr_idx,
+			u64 offset, u64 *val)
+{
+	unsigned long cr;
+	char c;
+	int rc = 0;
+
+	if (afu->crs_len < sz)
+		return -ENOENT;
+
+	if (unlikely(offset >= afu->crs_len))
+		return -ERANGE;
+
+	cr = get_zeroed_page(GFP_KERNEL);
+	if (!cr)
+		return -ENOMEM;
+
+	rc = cxl_h_get_config(afu->guest->handle, cr_idx, offset,
+			virt_to_phys((void *)cr), sz);
+	if (rc)
+		goto err;
+
+	switch (sz) {
+	case 1:
+		c = *((char *) cr);
+		*val = c;
+		break;
+	case 2:
+		*val = in_le16((u16 *)cr);
+		break;
+	case 4:
+		*val = in_le32((unsigned *)cr);
+		break;
+	case 8:
+		*val = in_le64((u64 *)cr);
+		break;
+	default:
+		WARN_ON(1);
+	}
+err:
+	free_page(cr);
+	return rc;
+}
+
+static int guest_afu_cr_read32(struct cxl_afu *afu, int cr_idx, u64 offset,
+			u32 *out)
+{
+	int rc;
+	u64 val;
+
+	rc = _guest_afu_cr_readXX(4, afu, cr_idx, offset, &val);
+	if (!rc)
+		*out = (u32) val;
+	return rc;
+}
+
+static int guest_afu_cr_read16(struct cxl_afu *afu, int cr_idx, u64 offset,
+			u16 *out)
+{
+	int rc;
+	u64 val;
+
+	rc = _guest_afu_cr_readXX(2, afu, cr_idx, offset, &val);
+	if (!rc)
+		*out = (u16) val;
+	return rc;
+}
+
+static int guest_afu_cr_read8(struct cxl_afu *afu, int cr_idx, u64 offset,
+			u8 *out)
+{
+	int rc;
+	u64 val;
+
+	rc = _guest_afu_cr_readXX(1, afu, cr_idx, offset, &val);
+	if (!rc)
+		*out = (u8) val;
+	return rc;
+}
+
+static int guest_afu_cr_read64(struct cxl_afu *afu, int cr_idx, u64 offset,
+			u64 *out)
+{
+	return _guest_afu_cr_readXX(8, afu, cr_idx, offset, out);
+}
+
+static int guest_afu_cr_write32(struct cxl_afu *afu, int cr, u64 off, u32 in)
+{
+	/* config record is not writable from guest */
+	return -EPERM;
+}
+
+static int guest_afu_cr_write16(struct cxl_afu *afu, int cr, u64 off, u16 in)
+{
+	/* config record is not writable from guest */
+	return -EPERM;
+}
+
+static int guest_afu_cr_write8(struct cxl_afu *afu, int cr, u64 off, u8 in)
+{
+	/* config record is not writable from guest */
+	return -EPERM;
+}
+
+static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
+{
+	struct cxl_process_element_hcall *elem;
+	struct cxl *adapter = ctx->afu->adapter;
+	const struct cred *cred;
+	u32 pid, idx;
+	int rc, r, i;
+	u64 mmio_addr, mmio_size;
+	__be64 flags = 0;
+
+	/* Must be 8 byte aligned and cannot cross a 4096 byte boundary */
+	if (!(elem = (struct cxl_process_element_hcall *)
+			get_zeroed_page(GFP_KERNEL)))
+		return -ENOMEM;
+
+	elem->version = cpu_to_be64(CXL_PROCESS_ELEMENT_VERSION);
+	if (ctx->kernel) {
+		pid = 0;
+		flags |= CXL_PE_TRANSLATION_ENABLED;
+		flags |= CXL_PE_PRIVILEGED_PROCESS;
+		if (mfmsr() & MSR_SF)
+			flags |= CXL_PE_64_BIT;
+	} else {
+		pid = current->pid;
+		flags |= CXL_PE_PROBLEM_STATE;
+		flags |= CXL_PE_TRANSLATION_ENABLED;
+		if (!test_tsk_thread_flag(current, TIF_32BIT))
+			flags |= CXL_PE_64_BIT;
+		cred = get_current_cred();
+		if (uid_eq(cred->euid, GLOBAL_ROOT_UID))
+			flags |= CXL_PE_PRIVILEGED_PROCESS;
+		put_cred(cred);
+	}
+	elem->flags         = cpu_to_be64(flags);
+	elem->common.tid    = cpu_to_be32(0); /* Unused */
+	elem->common.pid    = cpu_to_be32(pid);
+	elem->common.csrp   = cpu_to_be64(0); /* disable */
+	elem->common.aurp0  = cpu_to_be64(0); /* disable */
+	elem->common.aurp1  = cpu_to_be64(0); /* disable */
+
+	cxl_prefault(ctx, wed);
+
+	elem->common.sstp0  = cpu_to_be64(ctx->sstp0);
+	elem->common.sstp1  = cpu_to_be64(ctx->sstp1);
+	for (r = 0; r < CXL_IRQ_RANGES; r++) {
+		for (i = 0; i < ctx->irqs.range[r]; i++) {
+			if (r == 0 && i == 0) {
+				elem->pslVirtualIsn = cpu_to_be32(ctx->irqs.offset[0]);
+			} else {
+				idx = ctx->irqs.offset[r] + i - adapter->guest->irq_base_offset;
+				elem->applicationVirtualIsnBitmap[idx / 8] |= 0x80 >> (idx % 8);
+			}
+		}
+	}
+	elem->common.amr = cpu_to_be64(amr);
+	elem->common.wed = cpu_to_be64(wed);
+
+	disable_afu_irqs(ctx);
+
+	rc = cxl_h_attach_process(ctx->afu->guest->handle, elem,
+				&ctx->process_token, &mmio_addr, &mmio_size);
+	if (rc == H_SUCCESS) {
+		if (ctx->master || !ctx->afu->pp_psa) {
+			ctx->psn_phys = ctx->afu->psn_phys;
+			ctx->psn_size = ctx->afu->adapter->ps_size;
+		} else {
+			ctx->psn_phys = mmio_addr;
+			ctx->psn_size = mmio_size;
+		}
+		if (ctx->afu->pp_psa && mmio_size &&
+			ctx->afu->pp_size == 0) {
+			/*
+			 * There's no property in the device tree to read the
+			 * pp_size. We only find out at the 1st attach.
+			 * Compared to bare-metal, it is too late and we
+			 * should really lock here. However, on powerVM,
+			 * pp_size is really only used to display in /sys.
+			 * Being discussed with pHyp for their next release.
+			 */
+			ctx->afu->pp_size = mmio_size;
+		}
+		/* from PAPR: process element is bytes 4-7 of process token */
+		ctx->external_pe = ctx->process_token & 0xFFFFFFFF;
+		pr_devel("CXL pe=%i is known as %i for pHyp, mmio_size=%#llx",
+			ctx->pe, ctx->external_pe, ctx->psn_size);
+		ctx->pe_inserted = true;
+		enable_afu_irqs(ctx);
+	}
+
+	free_page((u64)elem);
+	return rc;
+}
+
+static int guest_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr)
+{
+	pr_devel("in %s\n", __func__);
+
+	ctx->kernel = kernel;
+	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
+		return attach_afu_directed(ctx, wed, amr);
+
+	/* dedicated mode not supported on FW840 */
+
+	return -EINVAL;
+}
+
+static int detach_afu_directed(struct cxl_context *ctx)
+{
+	if (!ctx->pe_inserted)
+		return 0;
+	if (cxl_h_detach_process(ctx->afu->guest->handle, ctx->process_token))
+		return -1;
+	return 0;
+}
+
+static int guest_detach_process(struct cxl_context *ctx)
+{
+	pr_devel("in %s\n", __func__);
+	trace_cxl_detach(ctx);
+
+	if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
+		return -EIO;
+
+	if (ctx->afu->current_mode == CXL_MODE_DIRECTED)
+		return detach_afu_directed(ctx);
+
+	return -EINVAL;
+}
+
+static void guest_release_afu(struct device *dev)
+{
+	struct cxl_afu *afu = to_cxl_afu(dev);
+
+	pr_devel("%s\n", __func__);
+
+	idr_destroy(&afu->contexts_idr);
+
+	kfree(afu->guest);
+	kfree(afu);
+}
+
+ssize_t cxl_guest_read_afu_vpd(struct cxl_afu *afu, void *buf, size_t len)
+{
+	return guest_collect_vpd(NULL, afu, buf, len);
+}
+
+#define ERR_BUFF_MAX_COPY_SIZE PAGE_SIZE
+static ssize_t guest_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
+					loff_t off, size_t count)
+{
+	void *tbuf = NULL;
+	int rc = 0;
+
+	tbuf = (void *) get_zeroed_page(GFP_KERNEL);
+	if (!tbuf)
+		return -ENOMEM;
+
+	rc = cxl_h_get_afu_err(afu->guest->handle,
+			       off & 0x7,
+			       virt_to_phys(tbuf),
+			       count);
+	if (rc)
+		goto err;
+
+	if (count > ERR_BUFF_MAX_COPY_SIZE)
+		count = ERR_BUFF_MAX_COPY_SIZE - (off & 0x7);
+	memcpy(buf, tbuf, count);
+err:
+	free_page((u64)tbuf);
+
+	return rc;
+}
+
+static int guest_afu_check_and_enable(struct cxl_afu *afu)
+{
+	return 0;
+}
+
+static bool guest_support_attributes(const char *attr_name,
+				     enum cxl_attrs type)
+{
+	switch (type) {
+	case CXL_ADAPTER_ATTRS:
+		if ((strcmp(attr_name, "base_image") == 0) ||
+			(strcmp(attr_name, "load_image_on_perst") == 0) ||
+			(strcmp(attr_name, "perst_reloads_same_image") == 0) ||
+			(strcmp(attr_name, "image_loaded") == 0))
+			return false;
+		break;
+	case CXL_AFU_MASTER_ATTRS:
+		if ((strcmp(attr_name, "pp_mmio_off") == 0))
+			return false;
+		break;
+	case CXL_AFU_ATTRS:
+		break;
+	default:
+		break;
+	}
+
+	return true;
+}
+
+static int activate_afu_directed(struct cxl_afu *afu)
+{
+	int rc;
+
+	dev_info(&afu->dev, "Activating AFU(%d) directed mode\n", afu->slice);
+
+	afu->current_mode = CXL_MODE_DIRECTED;
+
+	afu->num_procs = afu->max_procs_virtualised;
+
+	if ((rc = cxl_chardev_m_afu_add(afu)))
+		return rc;
+
+	if ((rc = cxl_sysfs_afu_m_add(afu)))
+		goto err;
+
+	if ((rc = cxl_chardev_s_afu_add(afu)))
+		goto err1;
+
+	return 0;
+err1:
+	cxl_sysfs_afu_m_remove(afu);
+err:
+	cxl_chardev_afu_remove(afu);
+	return rc;
+}
+
+static int guest_afu_activate_mode(struct cxl_afu *afu, int mode)
+{
+	if (!mode)
+		return 0;
+	if (!(mode & afu->modes_supported))
+		return -EINVAL;
+
+	if (mode == CXL_MODE_DIRECTED)
+		return activate_afu_directed(afu);
+
+	if (mode == CXL_MODE_DEDICATED)
+		dev_err(&afu->dev, "Dedicated mode not supported\n");
+
+	return -EINVAL;
+}
+
+static int deactivate_afu_directed(struct cxl_afu *afu)
+{
+	dev_info(&afu->dev, "Deactivating AFU(%d) directed mode\n", afu->slice);
+
+	afu->current_mode = 0;
+	afu->num_procs = 0;
+
+	cxl_sysfs_afu_m_remove(afu);
+	cxl_chardev_afu_remove(afu);
+
+	cxl_ops->afu_reset(afu);
+
+	return 0;
+}
+
+static int guest_afu_deactivate_mode(struct cxl_afu *afu, int mode)
+{
+	if (!mode)
+		return 0;
+	if (!(mode & afu->modes_supported))
+		return -EINVAL;
+
+	if (mode == CXL_MODE_DIRECTED)
+		return deactivate_afu_directed(afu);
+	return 0;
+}
+
+static int guest_afu_reset(struct cxl_afu *afu)
+{
+	pr_devel("AFU(%d) reset request\n", afu->slice);
+	return cxl_h_reset_afu(afu->guest->handle);
+}
+
+static int guest_map_slice_regs(struct cxl_afu *afu)
+{
+	if (!(afu->p2n_mmio = ioremap(afu->guest->p2n_phys, afu->guest->p2n_size))) {
+		dev_err(&afu->dev, "Error mapping AFU(%d) MMIO regions\n",
+			afu->slice);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+static void guest_unmap_slice_regs(struct cxl_afu *afu)
+{
+	if (afu->p2n_mmio)
+		iounmap(afu->p2n_mmio);
+}
+
+static int afu_update_state(struct cxl_afu *afu)
+{
+	int rc, cur_state;
+
+	rc = afu_read_error_state(afu, &cur_state);
+	if (rc)
+		return rc;
+
+	if (afu->guest->previous_state == cur_state)
+		return 0;
+
+	pr_devel("AFU(%d) update state to %#x\n", afu->slice, cur_state);
+
+	switch (cur_state) {
+	case H_STATE_NORMAL:
+		afu->guest->previous_state = cur_state;
+		rc = 1;
+		break;
+
+	case H_STATE_DISABLE:
+		pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
+				pci_channel_io_frozen);
+
+		cxl_context_detach_all(afu);
+		if ((rc = cxl_ops->afu_reset(afu)))
+			pr_devel("reset hcall failed %d\n", rc);
+
+		rc = afu_read_error_state(afu, &cur_state);
+		if (!rc && cur_state == H_STATE_NORMAL) {
+			pci_error_handlers(afu, CXL_SLOT_RESET_EVENT,
+					pci_channel_io_normal);
+			pci_error_handlers(afu, CXL_RESUME_EVENT, 0);
+			rc = 1;
+		}
+		afu->guest->previous_state = 0;
+		break;
+
+	case H_STATE_TEMP_UNAVAILABLE:
+		afu->guest->previous_state = cur_state;
+		break;
+
+	case H_STATE_PERM_UNAVAILABLE:
+		dev_err(&afu->dev, "AFU is in permanent error state\n");
+		pci_error_handlers(afu, CXL_ERROR_DETECTED_EVENT,
+				pci_channel_io_perm_failure);
+		afu->guest->previous_state = cur_state;
+		break;
+
+	default:
+		pr_err("Unexpected AFU(%d) error state: %#x\n",
+		       afu->slice, cur_state);
+		return -EINVAL;
+	}
+
+	return rc;
+}
+
+static int afu_do_recovery(struct cxl_afu *afu)
+{
+	int rc;
+
+	/* many threads can arrive here, in case of detach_all for example.
+	 * Only one needs to drive the recovery
+	 */
+	if (mutex_trylock(&afu->guest->recovery_lock)) {
+		rc = afu_update_state(afu);
+		mutex_unlock(&afu->guest->recovery_lock);
+		return rc;
+	}
+	return 0;
+}
+
+static bool guest_link_ok(struct cxl *cxl, struct cxl_afu *afu)
+{
+	int state;
+
+	if (afu) {
+		if (afu_read_error_state(afu, &state) ||
+			state != H_STATE_NORMAL) {
+			if (afu_do_recovery(afu) > 0) {
+				/* check again in case we've just fixed it */
+				if (!afu_read_error_state(afu, &state) &&
+					state == H_STATE_NORMAL)
+					return true;
+			}
+			return false;
+		}
+	}
+
+	return true;
+}
+
+static int afu_properties_look_ok(struct cxl_afu *afu)
+{
+	if (afu->pp_irqs < 0) {
+		dev_err(&afu->dev, "Unexpected per-process minimum interrupt value\n");
+		return -EINVAL;
+	}
+
+	if (afu->max_procs_virtualised < 1) {
+		dev_err(&afu->dev, "Unexpected max number of processes virtualised value\n");
+		return -EINVAL;
+	}
+
+	if (afu->crs_len < 0) {
+		dev_err(&afu->dev, "Unexpected configuration record size value\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int cxl_guest_init_afu(struct cxl *adapter, int slice, struct device_node *afu_np)
+{
+	struct cxl_afu *afu;
+	bool free = true;
+	int rc;
+
+	pr_devel("in %s - AFU(%d)\n", __func__, slice);
+	if (!(afu = cxl_alloc_afu(adapter, slice)))
+		return -ENOMEM;
+
+	if (!(afu->guest = kzalloc(sizeof(struct cxl_afu_guest), GFP_KERNEL))) {
+		kfree(afu);
+		return -ENOMEM;
+	}
+
+	mutex_init(&afu->guest->recovery_lock);
+
+	if ((rc = dev_set_name(&afu->dev, "afu%i.%i",
+					  adapter->adapter_num,
+					  slice)))
+		goto err1;
+
+	adapter->slices++;
+
+	if ((rc = cxl_of_read_afu_handle(afu, afu_np)))
+		goto err1;
+
+	if ((rc = cxl_ops->afu_reset(afu)))
+		goto err1;
+
+	if ((rc = cxl_of_read_afu_properties(afu, afu_np)))
+		goto err1;
+
+	if ((rc = afu_properties_look_ok(afu)))
+		goto err1;
+
+	if ((rc = guest_map_slice_regs(afu)))
+		goto err1;
+
+	if ((rc = guest_register_serr_irq(afu)))
+		goto err2;
+
+	/*
+	 * After we call this function we must not free the afu directly, even
+	 * if it returns an error!
+	 */
+	if ((rc = cxl_register_afu(afu)))
+		goto err_put1;
+
+	if ((rc = cxl_sysfs_afu_add(afu)))
+		goto err_put1;
+
+	/*
+	 * pHyp doesn't expose the programming models supported by the
+	 * AFU. pHyp currently only supports directed mode. If it adds
+	 * dedicated mode later, this version of cxl has no way to
+	 * detect it. So we'll initialize the driver, but the first
+	 * attach will fail.
+	 * Being discussed with pHyp to do better (likely new property)
+	 */
+	if (afu->max_procs_virtualised == 1)
+		afu->modes_supported = CXL_MODE_DEDICATED;
+	else
+		afu->modes_supported = CXL_MODE_DIRECTED;
+
+	if ((rc = cxl_afu_select_best_mode(afu)))
+		goto err_put2;
+
+	adapter->afu[afu->slice] = afu;
+
+	afu->enabled = true;
+
+	if ((rc = cxl_pci_vphb_add(afu)))
+		dev_info(&afu->dev, "Can't register vPHB\n");
+
+	return 0;
+
+err_put2:
+	cxl_sysfs_afu_remove(afu);
+err_put1:
+	device_unregister(&afu->dev);
+	free = false;
+	guest_release_serr_irq(afu);
+err2:
+	guest_unmap_slice_regs(afu);
+err1:
+	if (free) {
+		kfree(afu->guest);
+		kfree(afu);
+	}
+	return rc;
+}
+
+void cxl_guest_remove_afu(struct cxl_afu *afu)
+{
+	pr_devel("in %s - AFU(%d)\n", __func__, afu->slice);
+
+	if (!afu)
+		return;
+
+	cxl_pci_vphb_remove(afu);
+	cxl_sysfs_afu_remove(afu);
+
+	spin_lock(&afu->adapter->afu_list_lock);
+	afu->adapter->afu[afu->slice] = NULL;
+	spin_unlock(&afu->adapter->afu_list_lock);
+
+	cxl_context_detach_all(afu);
+	cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
+	guest_release_serr_irq(afu);
+	guest_unmap_slice_regs(afu);
+
+	device_unregister(&afu->dev);
+}
+
+static void free_adapter(struct cxl *adapter)
+{
+	struct irq_avail *cur;
+	int i;
+
+	if (adapter->guest->irq_avail) {
+		for (i = 0; i < adapter->guest->irq_nranges; i++) {
+			cur = &adapter->guest->irq_avail[i];
+			kfree(cur->bitmap);
+		}
+		kfree(adapter->guest->irq_avail);
+	}
+	kfree(adapter->guest->status);
+	cxl_remove_adapter_nr(adapter);
+	kfree(adapter->guest);
+	kfree(adapter);
+}
+
+static int properties_look_ok(struct cxl *adapter)
+{
+	/* The absence of this property means that the operational
+	 * status is unknown or okay
+	 */
+	if (strlen(adapter->guest->status) &&
+	    strcmp(adapter->guest->status, "okay")) {
+		pr_err("ABORTING:Bad operational status of the device\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+ssize_t cxl_guest_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len)
+{
+	return guest_collect_vpd(adapter, NULL, buf, len);
+}
+
+void cxl_guest_remove_adapter(struct cxl *adapter)
+{
+	pr_devel("in %s\n", __func__);
+
+	cxl_sysfs_adapter_remove(adapter);
+
+	cxl_guest_remove_chardev(adapter);
+	device_unregister(&adapter->dev);
+}
+
+static void release_adapter(struct device *dev)
+{
+	free_adapter(to_cxl_adapter(dev));
+}
+
+struct cxl *cxl_guest_init_adapter(struct device_node *np, struct platform_device *pdev)
+{
+	struct cxl *adapter;
+	bool free = true;
+	int rc;
+
+	if (!(adapter = cxl_alloc_adapter()))
+		return ERR_PTR(-ENOMEM);
+
+	if (!(adapter->guest = kzalloc(sizeof(struct cxl_guest), GFP_KERNEL))) {
+		free_adapter(adapter);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	adapter->slices = 0;
+	adapter->guest->pdev = pdev;
+	adapter->dev.parent = &pdev->dev;
+	adapter->dev.release = release_adapter;
+	dev_set_drvdata(&pdev->dev, adapter);
+
+	if ((rc = cxl_of_read_adapter_handle(adapter, np)))
+		goto err1;
+
+	if ((rc = cxl_of_read_adapter_properties(adapter, np)))
+		goto err1;
+
+	if ((rc = properties_look_ok(adapter)))
+		goto err1;
+
+	if ((rc = cxl_guest_add_chardev(adapter)))
+		goto err1;
+
+	/*
+	 * After we call this function we must not free the adapter directly,
+	 * even if it returns an error!
+	 */
+	if ((rc = cxl_register_adapter(adapter)))
+		goto err_put1;
+
+	if ((rc = cxl_sysfs_adapter_add(adapter)))
+		goto err_put1;
+
+	return adapter;
+
+err_put1:
+	device_unregister(&adapter->dev);
+	free = false;
+	cxl_guest_remove_chardev(adapter);
+err1:
+	if (free)
+		free_adapter(adapter);
+	return ERR_PTR(rc);
+}
+
+void cxl_guest_reload_module(struct cxl *adapter)
+{
+	struct platform_device *pdev;
+
+	pdev = adapter->guest->pdev;
+	cxl_guest_remove_adapter(adapter);
+
+	cxl_of_probe(pdev);
+}
+
+const struct cxl_backend_ops cxl_guest_ops = {
+	.module = THIS_MODULE,
+	.adapter_reset = guest_reset,
+	.alloc_one_irq = guest_alloc_one_irq,
+	.release_one_irq = guest_release_one_irq,
+	.alloc_irq_ranges = guest_alloc_irq_ranges,
+	.release_irq_ranges = guest_release_irq_ranges,
+	.setup_irq = NULL,
+	.handle_psl_slice_error = guest_handle_psl_slice_error,
+	.psl_interrupt = guest_psl_irq,
+	.ack_irq = guest_ack_irq,
+	.attach_process = guest_attach_process,
+	.detach_process = guest_detach_process,
+	.support_attributes = guest_support_attributes,
+	.link_ok = guest_link_ok,
+	.release_afu = guest_release_afu,
+	.afu_read_err_buffer = guest_afu_read_err_buffer,
+	.afu_check_and_enable = guest_afu_check_and_enable,
+	.afu_activate_mode = guest_afu_activate_mode,
+	.afu_deactivate_mode = guest_afu_deactivate_mode,
+	.afu_reset = guest_afu_reset,
+	.afu_cr_read8 = guest_afu_cr_read8,
+	.afu_cr_read16 = guest_afu_cr_read16,
+	.afu_cr_read32 = guest_afu_cr_read32,
+	.afu_cr_read64 = guest_afu_cr_read64,
+	.afu_cr_write8 = guest_afu_cr_write8,
+	.afu_cr_write16 = guest_afu_cr_write16,
+	.afu_cr_write32 = guest_afu_cr_write32,
+	.read_adapter_vpd = cxl_guest_read_adapter_vpd,
+};
diff --git a/drivers/misc/cxl/hcalls.c b/drivers/misc/cxl/hcalls.c
new file mode 100644
index 000000000000..d6d11f4056d7
--- /dev/null
+++ b/drivers/misc/cxl/hcalls.c
@@ -0,0 +1,647 @@
+/*
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/delay.h>
+#include <asm/byteorder.h>
+#include "hcalls.h"
+#include "trace.h"
+
+#define CXL_HCALL_TIMEOUT 60000
+#define CXL_HCALL_TIMEOUT_DOWNLOAD 120000
+
+#define H_ATTACH_CA_PROCESS    0x344
+#define H_CONTROL_CA_FUNCTION  0x348
+#define H_DETACH_CA_PROCESS    0x34C
+#define H_COLLECT_CA_INT_INFO  0x350
+#define H_CONTROL_CA_FAULTS    0x354
+#define H_DOWNLOAD_CA_FUNCTION 0x35C
+#define H_DOWNLOAD_CA_FACILITY 0x364
+#define H_CONTROL_CA_FACILITY  0x368
+
+#define H_CONTROL_CA_FUNCTION_RESET                   1 /* perform a reset */
+#define H_CONTROL_CA_FUNCTION_SUSPEND_PROCESS         2 /* suspend a process from being executed */
+#define H_CONTROL_CA_FUNCTION_RESUME_PROCESS          3 /* resume a process to be executed */
+#define H_CONTROL_CA_FUNCTION_READ_ERR_STATE          4 /* read the error state */
+#define H_CONTROL_CA_FUNCTION_GET_AFU_ERR             5 /* collect the AFU error buffer */
+#define H_CONTROL_CA_FUNCTION_GET_CONFIG              6 /* collect configuration record */
+#define H_CONTROL_CA_FUNCTION_GET_DOWNLOAD_STATE      7 /* query to return download status */
+#define H_CONTROL_CA_FUNCTION_TERMINATE_PROCESS       8 /* terminate the process before completion */
+#define H_CONTROL_CA_FUNCTION_COLLECT_VPD             9 /* collect VPD */
+#define H_CONTROL_CA_FUNCTION_GET_FUNCTION_ERR_INT   11 /* read the function-wide error data based on an interrupt */
+#define H_CONTROL_CA_FUNCTION_ACK_FUNCTION_ERR_INT   12 /* acknowledge function-wide error data based on an interrupt */
+#define H_CONTROL_CA_FUNCTION_GET_ERROR_LOG          13 /* retrieve the Platform Log ID (PLID) of an error log */
+
+#define H_CONTROL_CA_FAULTS_RESPOND_PSL         1
+#define H_CONTROL_CA_FAULTS_RESPOND_AFU         2
+
+#define H_CONTROL_CA_FACILITY_RESET             1 /* perform a reset */
+#define H_CONTROL_CA_FACILITY_COLLECT_VPD       2 /* collect VPD */
+
+#define H_DOWNLOAD_CA_FACILITY_DOWNLOAD         1 /* download adapter image */
+#define H_DOWNLOAD_CA_FACILITY_VALIDATE         2 /* validate adapter image */
+
+
+#define _CXL_LOOP_HCALL(call, rc, retbuf, fn, ...)			\
+	{								\
+		unsigned int delay, total_delay = 0;			\
+		u64 token = 0;						\
+									\
+		memset(retbuf, 0, sizeof(retbuf));			\
+		while (1) {						\
+			rc = call(fn, retbuf, __VA_ARGS__, token);	\
+			token = retbuf[0];				\
+			if (rc != H_BUSY && !H_IS_LONG_BUSY(rc))	\
+				break;					\
+									\
+			if (rc == H_BUSY)				\
+				delay = 10;				\
+			else						\
+				delay = get_longbusy_msecs(rc);		\
+									\
+			total_delay += delay;				\
+			if (total_delay > CXL_HCALL_TIMEOUT) {		\
+				WARN(1, "Warning: Giving up waiting for CXL hcall " \
+					"%#x after %u msec\n", fn, total_delay); \
+				rc = H_BUSY;				\
+				break;					\
+			}						\
+			msleep(delay);					\
+		}							\
+	}
+#define CXL_H_WAIT_UNTIL_DONE(...)  _CXL_LOOP_HCALL(plpar_hcall, __VA_ARGS__)
+#define CXL_H9_WAIT_UNTIL_DONE(...) _CXL_LOOP_HCALL(plpar_hcall9, __VA_ARGS__)
+
+#define _PRINT_MSG(rc, format, ...)					\
+	{								\
+		if ((rc != H_SUCCESS) && (rc != H_CONTINUE))		\
+			pr_err(format, __VA_ARGS__);			\
+		else							\
+			pr_devel(format, __VA_ARGS__);			\
+	}								\
+
+
+static char *afu_op_names[] = {
+	"UNKNOWN_OP",		/* 0 undefined */
+	"RESET",		/* 1 */
+	"SUSPEND_PROCESS",	/* 2 */
+	"RESUME_PROCESS",	/* 3 */
+	"READ_ERR_STATE",	/* 4 */
+	"GET_AFU_ERR",		/* 5 */
+	"GET_CONFIG",		/* 6 */
+	"GET_DOWNLOAD_STATE",	/* 7 */
+	"TERMINATE_PROCESS",	/* 8 */
+	"COLLECT_VPD",		/* 9 */
+	"UNKNOWN_OP",		/* 10 undefined */
+	"GET_FUNCTION_ERR_INT",	/* 11 */
+	"ACK_FUNCTION_ERR_INT",	/* 12 */
+	"GET_ERROR_LOG",	/* 13 */
+};
+
+static char *control_adapter_op_names[] = {
+	"UNKNOWN_OP",		/* 0 undefined */
+	"RESET",		/* 1 */
+	"COLLECT_VPD",		/* 2 */
+};
+
+static char *download_op_names[] = {
+	"UNKNOWN_OP",		/* 0 undefined */
+	"DOWNLOAD",		/* 1 */
+	"VALIDATE",		/* 2 */
+};
+
+static char *op_str(unsigned int op, char *name_array[], int array_len)
+{
+	if (op >= array_len)
+		return "UNKNOWN_OP";
+	return name_array[op];
+}
+
+#define OP_STR(op, name_array)      op_str(op, name_array, ARRAY_SIZE(name_array))
+
+#define OP_STR_AFU(op)              OP_STR(op, afu_op_names)
+#define OP_STR_CONTROL_ADAPTER(op)  OP_STR(op, control_adapter_op_names)
+#define OP_STR_DOWNLOAD_ADAPTER(op) OP_STR(op, download_op_names)
+
+
+long cxl_h_attach_process(u64 unit_address,
+			struct cxl_process_element_hcall *element,
+			u64 *process_token, u64 *mmio_addr, u64 *mmio_size)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	CXL_H_WAIT_UNTIL_DONE(rc, retbuf, H_ATTACH_CA_PROCESS, unit_address, virt_to_phys(element));
+	_PRINT_MSG(rc, "cxl_h_attach_process(%#.16llx, %#.16lx): %li\n",
+		unit_address, virt_to_phys(element), rc);
+	trace_cxl_hcall_attach(unit_address, virt_to_phys(element), retbuf[0], retbuf[1], retbuf[2], rc);
+
+	pr_devel("token: 0x%.8lx mmio_addr: 0x%lx mmio_size: 0x%lx\nProcess Element Structure:\n",
+		retbuf[0], retbuf[1], retbuf[2]);
+	cxl_dump_debug_buffer(element, sizeof(*element));
+
+	switch (rc) {
+	case H_SUCCESS:       /* The process info is attached to the coherent platform function */
+		*process_token = retbuf[0];
+		if (mmio_addr)
+			*mmio_addr = retbuf[1];
+		if (mmio_size)
+			*mmio_size = retbuf[2];
+		return 0;
+	case H_PARAMETER:     /* An incorrect parameter was supplied. */
+	case H_FUNCTION:      /* The function is not supported. */
+		return -EINVAL;
+	case H_AUTHORITY:     /* The partition does not have authority to perform this hcall */
+	case H_RESOURCE:      /* The coherent platform function does not have enough additional resource to attach the process */
+	case H_HARDWARE:      /* A hardware event prevented the attach operation */
+	case H_STATE:         /* The coherent platform function is not in a valid state */
+	case H_BUSY:
+		return -EBUSY;
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_detach_process - Detach a process element from a coherent
+ *                        platform function.
+ */
+long cxl_h_detach_process(u64 unit_address, u64 process_token)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	CXL_H_WAIT_UNTIL_DONE(rc, retbuf, H_DETACH_CA_PROCESS, unit_address, process_token);
+	_PRINT_MSG(rc, "cxl_h_detach_process(%#.16llx, 0x%.8llx): %li\n", unit_address, process_token, rc);
+	trace_cxl_hcall_detach(unit_address, process_token, rc);
+
+	switch (rc) {
+	case H_SUCCESS:       /* The process was detached from the coherent platform function */
+		return 0;
+	case H_PARAMETER:     /* An incorrect parameter was supplied. */
+		return -EINVAL;
+	case H_AUTHORITY:     /* The partition does not have authority to perform this hcall */
+	case H_RESOURCE:      /* The function has page table mappings for MMIO */
+	case H_HARDWARE:      /* A hardware event prevented the detach operation */
+	case H_STATE:         /* The coherent platform function is not in a valid state */
+	case H_BUSY:
+		return -EBUSY;
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_control_function - This H_CONTROL_CA_FUNCTION hypervisor call allows
+ *                          the partition to manipulate or query
+ *                          certain coherent platform function behaviors.
+ */
+static long cxl_h_control_function(u64 unit_address, u64 op,
+				   u64 p1, u64 p2, u64 p3, u64 p4, u64 *out)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	long rc;
+
+	CXL_H9_WAIT_UNTIL_DONE(rc, retbuf, H_CONTROL_CA_FUNCTION, unit_address, op, p1, p2, p3, p4);
+	_PRINT_MSG(rc, "cxl_h_control_function(%#.16llx, %s(%#llx, %#llx, %#llx, %#llx, R4: %#lx)): %li\n",
+		unit_address, OP_STR_AFU(op), p1, p2, p3, p4, retbuf[0], rc);
+	trace_cxl_hcall_control_function(unit_address, OP_STR_AFU(op), p1, p2, p3, p4, retbuf[0], rc);
+
+	switch (rc) {
+	case H_SUCCESS:       /* The operation is completed for the coherent platform function */
+		if ((op == H_CONTROL_CA_FUNCTION_GET_FUNCTION_ERR_INT ||
+		     op == H_CONTROL_CA_FUNCTION_READ_ERR_STATE ||
+		     op == H_CONTROL_CA_FUNCTION_COLLECT_VPD))
+			*out = retbuf[0];
+		return 0;
+	case H_PARAMETER:     /* An incorrect parameter was supplied. */
+	case H_FUNCTION:      /* The function is not supported. */
+	case H_NOT_FOUND:     /* The operation supplied was not valid */
+	case H_NOT_AVAILABLE: /* The operation cannot be performed because the AFU has not been downloaded */
+	case H_SG_LIST:       /* An block list entry was invalid */
+		return -EINVAL;
+	case H_AUTHORITY:     /* The partition does not have authority to perform this hcall */
+	case H_RESOURCE:      /* The function has page table mappings for MMIO */
+	case H_HARDWARE:      /* A hardware event prevented the attach operation */
+	case H_STATE:         /* The coherent platform function is not in a valid state */
+	case H_BUSY:
+		return -EBUSY;
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_reset_afu - Perform a reset to the coherent platform function.
+ */
+long cxl_h_reset_afu(u64 unit_address)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_RESET,
+				0, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_suspend_process - Suspend a process from being executed
+ * Parameter1 = process-token as returned from H_ATTACH_CA_PROCESS when
+ *              process was attached.
+ */
+long cxl_h_suspend_process(u64 unit_address, u64 process_token)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_SUSPEND_PROCESS,
+				process_token, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_resume_process - Resume a process to be executed
+ * Parameter1 = process-token as returned from H_ATTACH_CA_PROCESS when
+ *              process was attached.
+ */
+long cxl_h_resume_process(u64 unit_address, u64 process_token)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_RESUME_PROCESS,
+				process_token, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_read_error_state - Checks the error state of the coherent
+ *                          platform function.
+ * R4 contains the error state
+ */
+long cxl_h_read_error_state(u64 unit_address, u64 *state)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_READ_ERR_STATE,
+				0, 0, 0, 0,
+				state);
+}
+
+/**
+ * cxl_h_get_afu_err - collect the AFU error buffer
+ * Parameter1 = byte offset into error buffer to retrieve, valid values
+ *              are between 0 and (ibm,error-buffer-size - 1)
+ * Parameter2 = 4K aligned real address of error buffer, to be filled in
+ * Parameter3 = length of error buffer, valid values are 4K or less
+ */
+long cxl_h_get_afu_err(u64 unit_address, u64 offset,
+		u64 buf_address, u64 len)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_GET_AFU_ERR,
+				offset, buf_address, len, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_get_config - collect configuration record for the
+ *                    coherent platform function
+ * Parameter1 = # of configuration record to retrieve, valid values are
+ *              between 0 and (ibm,#config-records - 1)
+ * Parameter2 = byte offset into configuration record to retrieve,
+ *              valid values are between 0 and (ibm,config-record-size - 1)
+ * Parameter3 = 4K aligned real address of configuration record buffer,
+ *              to be filled in
+ * Parameter4 = length of configuration buffer, valid values are 4K or less
+ */
+long cxl_h_get_config(u64 unit_address, u64 cr_num, u64 offset,
+		u64 buf_address, u64 len)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_GET_CONFIG,
+				cr_num, offset, buf_address, len,
+				NULL);
+}
+
+/**
+ * cxl_h_terminate_process - Terminate the process before completion
+ * Parameter1 = process-token as returned from H_ATTACH_CA_PROCESS when
+ *              process was attached.
+ */
+long cxl_h_terminate_process(u64 unit_address, u64 process_token)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_TERMINATE_PROCESS,
+				process_token, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_collect_vpd - Collect VPD for the coherent platform function.
+ * Parameter1 = # of VPD record to retrieve, valid values are between 0
+ *              and (ibm,#config-records - 1).
+ * Parameter2 = 4K naturally aligned real buffer containing block
+ *              list entries
+ * Parameter3 = number of block list entries in the block list, valid
+ *              values are between 0 and 256
+ */
+long cxl_h_collect_vpd(u64 unit_address, u64 record, u64 list_address,
+		       u64 num, u64 *out)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_COLLECT_VPD,
+				record, list_address, num, 0,
+				out);
+}
+
+/**
+ * cxl_h_get_fn_error_interrupt - Read the function-wide error data based on an interrupt
+ */
+long cxl_h_get_fn_error_interrupt(u64 unit_address, u64 *reg)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_GET_FUNCTION_ERR_INT,
+				0, 0, 0, 0, reg);
+}
+
+/**
+ * cxl_h_ack_fn_error_interrupt - Acknowledge function-wide error data
+ *                                based on an interrupt
+ * Parameter1 = value to write to the function-wide error interrupt register
+ */
+long cxl_h_ack_fn_error_interrupt(u64 unit_address, u64 value)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_ACK_FUNCTION_ERR_INT,
+				value, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_get_error_log - Retrieve the Platform Log ID (PLID) of
+ *                       an error log
+ */
+long cxl_h_get_error_log(u64 unit_address, u64 value)
+{
+	return cxl_h_control_function(unit_address,
+				H_CONTROL_CA_FUNCTION_GET_ERROR_LOG,
+				0, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_collect_int_info - Collect interrupt info about a coherent
+ *                          platform function after an interrupt occurred.
+ */
+long cxl_h_collect_int_info(u64 unit_address, u64 process_token,
+			    struct cxl_irq_info *info)
+{
+	long rc;
+
+	BUG_ON(sizeof(*info) != sizeof(unsigned long[PLPAR_HCALL9_BUFSIZE]));
+
+	rc = plpar_hcall9(H_COLLECT_CA_INT_INFO, (unsigned long *) info,
+			unit_address, process_token);
+	_PRINT_MSG(rc, "cxl_h_collect_int_info(%#.16llx, 0x%llx): %li\n",
+		unit_address, process_token, rc);
+	trace_cxl_hcall_collect_int_info(unit_address, process_token, rc);
+
+	switch (rc) {
+	case H_SUCCESS:     /* The interrupt info is returned in return registers. */
+		pr_devel("dsisr:%#llx, dar:%#llx, dsr:%#llx, pid:%u, tid:%u, afu_err:%#llx, errstat:%#llx\n",
+			info->dsisr, info->dar, info->dsr, info->pid,
+			info->tid, info->afu_err, info->errstat);
+		return 0;
+	case H_PARAMETER:   /* An incorrect parameter was supplied. */
+		return -EINVAL;
+	case H_AUTHORITY:   /* The partition does not have authority to perform this hcall. */
+	case H_HARDWARE:    /* A hardware event prevented the collection of the interrupt info.*/
+	case H_STATE:       /* The coherent platform function is not in a valid state to collect interrupt info. */
+		return -EBUSY;
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_control_faults - Control the operation of a coherent platform
+ *                        function after a fault occurs.
+ *
+ * Parameters
+ *    control-mask: value to control the faults
+ *                  looks like PSL_TFC_An shifted >> 32
+ *    reset-mask: mask to control reset of function faults
+ *                Set reset_mask = 1 to reset PSL errors
+ */
+long cxl_h_control_faults(u64 unit_address, u64 process_token,
+			  u64 control_mask, u64 reset_mask)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	memset(retbuf, 0, sizeof(retbuf));
+
+	rc = plpar_hcall(H_CONTROL_CA_FAULTS, retbuf, unit_address,
+			H_CONTROL_CA_FAULTS_RESPOND_PSL, process_token,
+			control_mask, reset_mask);
+	_PRINT_MSG(rc, "cxl_h_control_faults(%#.16llx, 0x%llx, %#llx, %#llx): %li (%#lx)\n",
+		unit_address, process_token, control_mask, reset_mask,
+		rc, retbuf[0]);
+	trace_cxl_hcall_control_faults(unit_address, process_token,
+				control_mask, reset_mask, retbuf[0], rc);
+
+	switch (rc) {
+	case H_SUCCESS:    /* Faults were successfully controlled for the function. */
+		return 0;
+	case H_PARAMETER:  /* An incorrect parameter was supplied. */
+		return -EINVAL;
+	case H_HARDWARE:   /* A hardware event prevented the control of faults. */
+	case H_STATE:      /* The function was in an invalid state. */
+	case H_AUTHORITY:  /* The partition does not have authority to perform this hcall; the coherent platform facilities may need to be licensed. */
+		return -EBUSY;
+	case H_FUNCTION:   /* The function is not supported */
+	case H_NOT_FOUND:  /* The operation supplied was not valid */
+		return -EINVAL;
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_control_facility - This H_CONTROL_CA_FACILITY hypervisor call
+ *                          allows the partition to manipulate or query
+ *                          certain coherent platform facility behaviors.
+ */
+static long cxl_h_control_facility(u64 unit_address, u64 op,
+				   u64 p1, u64 p2, u64 p3, u64 p4, u64 *out)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	long rc;
+
+	CXL_H9_WAIT_UNTIL_DONE(rc, retbuf, H_CONTROL_CA_FACILITY, unit_address, op, p1, p2, p3, p4);
+	_PRINT_MSG(rc, "cxl_h_control_facility(%#.16llx, %s(%#llx, %#llx, %#llx, %#llx, R4: %#lx)): %li\n",
+		unit_address, OP_STR_CONTROL_ADAPTER(op), p1, p2, p3, p4, retbuf[0], rc);
+	trace_cxl_hcall_control_facility(unit_address, OP_STR_CONTROL_ADAPTER(op), p1, p2, p3, p4, retbuf[0], rc);
+
+	switch (rc) {
+	case H_SUCCESS:       /* The operation is completed for the coherent platform facility */
+		if (op == H_CONTROL_CA_FACILITY_COLLECT_VPD)
+			*out = retbuf[0];
+		return 0;
+	case H_PARAMETER:     /* An incorrect parameter was supplied. */
+	case H_FUNCTION:      /* The function is not supported. */
+	case H_NOT_FOUND:     /* The operation supplied was not valid */
+	case H_NOT_AVAILABLE: /* The operation cannot be performed because the AFU has not been downloaded */
+	case H_SG_LIST:       /* An block list entry was invalid */
+		return -EINVAL;
+	case H_AUTHORITY:     /* The partition does not have authority to perform this hcall */
+	case H_RESOURCE:      /* The function has page table mappings for MMIO */
+	case H_HARDWARE:      /* A hardware event prevented the attach operation */
+	case H_STATE:         /* The coherent platform facility is not in a valid state */
+	case H_BUSY:
+		return -EBUSY;
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_reset_adapter - Perform a reset to the coherent platform facility.
+ */
+long cxl_h_reset_adapter(u64 unit_address)
+{
+	return cxl_h_control_facility(unit_address,
+				H_CONTROL_CA_FACILITY_RESET,
+				0, 0, 0, 0,
+				NULL);
+}
+
+/**
+ * cxl_h_collect_vpd - Collect VPD for the coherent platform function.
+ * Parameter1 = 4K naturally aligned real buffer containing block
+ *              list entries
+ * Parameter2 = number of block list entries in the block list, valid
+ *              values are between 0 and 256
+ */
+long cxl_h_collect_vpd_adapter(u64 unit_address, u64 list_address,
+			       u64 num, u64 *out)
+{
+	return cxl_h_control_facility(unit_address,
+				H_CONTROL_CA_FACILITY_COLLECT_VPD,
+				list_address, num, 0, 0,
+				out);
+}
+
+/**
+ * cxl_h_download_facility - This H_DOWNLOAD_CA_FACILITY
+ *                    hypervisor call provide platform support for
+ *                    downloading a base adapter image to the coherent
+ *                    platform facility, and for validating the entire
+ *                    image after the download.
+ * Parameters
+ *    op: operation to perform to the coherent platform function
+ *      Download: operation = 1, the base image in the coherent platform
+ *                               facility is first erased, and then
+ *                               programmed using the image supplied
+ *                               in the scatter/gather list.
+ *      Validate: operation = 2, the base image in the coherent platform
+ *                               facility is compared with the image
+ *                               supplied in the scatter/gather list.
+ *    list_address: 4K naturally aligned real buffer containing
+ *                  scatter/gather list entries.
+ *    num: number of block list entries in the scatter/gather list.
+ */
+static long cxl_h_download_facility(u64 unit_address, u64 op,
+				    u64 list_address, u64 num,
+				    u64 *out)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	unsigned int delay, total_delay = 0;
+	u64 token = 0;
+	long rc;
+
+	if (*out != 0)
+		token = *out;
+
+	memset(retbuf, 0, sizeof(retbuf));
+	while (1) {
+		rc = plpar_hcall(H_DOWNLOAD_CA_FACILITY, retbuf,
+				 unit_address, op, list_address, num,
+				 token);
+		token = retbuf[0];
+		if (rc != H_BUSY && !H_IS_LONG_BUSY(rc))
+			break;
+
+		if (rc != H_BUSY) {
+			delay = get_longbusy_msecs(rc);
+			total_delay += delay;
+			if (total_delay > CXL_HCALL_TIMEOUT_DOWNLOAD) {
+				WARN(1, "Warning: Giving up waiting for CXL hcall "
+					"%#x after %u msec\n",
+					H_DOWNLOAD_CA_FACILITY, total_delay);
+				rc = H_BUSY;
+				break;
+			}
+			msleep(delay);
+		}
+	}
+	_PRINT_MSG(rc, "cxl_h_download_facility(%#.16llx, %s(%#llx, %#llx), %#lx): %li\n",
+		 unit_address, OP_STR_DOWNLOAD_ADAPTER(op), list_address, num, retbuf[0], rc);
+	trace_cxl_hcall_download_facility(unit_address, OP_STR_DOWNLOAD_ADAPTER(op), list_address, num, retbuf[0], rc);
+
+	switch (rc) {
+	case H_SUCCESS:       /* The operation is completed for the coherent platform facility */
+		return 0;
+	case H_PARAMETER:     /* An incorrect parameter was supplied */
+	case H_FUNCTION:      /* The function is not supported. */
+	case H_SG_LIST:       /* An block list entry was invalid */
+	case H_BAD_DATA:      /* Image verification failed */
+		return -EINVAL;
+	case H_AUTHORITY:     /* The partition does not have authority to perform this hcall */
+	case H_RESOURCE:      /* The function has page table mappings for MMIO */
+	case H_HARDWARE:      /* A hardware event prevented the attach operation */
+	case H_STATE:         /* The coherent platform facility is not in a valid state */
+	case H_BUSY:
+		return -EBUSY;
+	case H_CONTINUE:
+		*out = retbuf[0];
+		return 1;  /* More data is needed for the complete image */
+	default:
+		WARN(1, "Unexpected return code: %lx", rc);
+		return -EINVAL;
+	}
+}
+
+/**
+ * cxl_h_download_adapter_image - Download the base image to the coherent
+ *                                platform facility.
+ */
+long cxl_h_download_adapter_image(u64 unit_address,
+				  u64 list_address, u64 num,
+				  u64 *out)
+{
+	return cxl_h_download_facility(unit_address,
+				       H_DOWNLOAD_CA_FACILITY_DOWNLOAD,
+				       list_address, num, out);
+}
+
+/**
+ * cxl_h_validate_adapter_image - Validate the base image in the coherent
+ *                                platform facility.
+ */
+long cxl_h_validate_adapter_image(u64 unit_address,
+				  u64 list_address, u64 num,
+				  u64 *out)
+{
+	return cxl_h_download_facility(unit_address,
+				       H_DOWNLOAD_CA_FACILITY_VALIDATE,
+				       list_address, num, out);
+}
diff --git a/drivers/misc/cxl/hcalls.h b/drivers/misc/cxl/hcalls.h
new file mode 100644
index 000000000000..3e25522a5df6
--- /dev/null
+++ b/drivers/misc/cxl/hcalls.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef _HCALLS_H
+#define _HCALLS_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+#include <asm/hvcall.h>
+#include "cxl.h"
+
+#define SG_BUFFER_SIZE 4096
+#define SG_MAX_ENTRIES 256
+
+struct sg_list {
+	u64 phys_addr;
+	u64 len;
+};
+
+/*
+ * This is straight out of PAPR, but replacing some of the compound fields with
+ * a single field, where they were identical to the register layout.
+ *
+ * The 'flags' parameter regroups the various bit-fields
+ */
+#define CXL_PE_CSRP_VALID			(1ULL << 63)
+#define CXL_PE_PROBLEM_STATE			(1ULL << 62)
+#define CXL_PE_SECONDARY_SEGMENT_TBL_SRCH	(1ULL << 61)
+#define CXL_PE_TAGS_ACTIVE			(1ULL << 60)
+#define CXL_PE_USER_STATE			(1ULL << 59)
+#define CXL_PE_TRANSLATION_ENABLED		(1ULL << 58)
+#define CXL_PE_64_BIT				(1ULL << 57)
+#define CXL_PE_PRIVILEGED_PROCESS		(1ULL << 56)
+
+#define CXL_PROCESS_ELEMENT_VERSION 1
+struct cxl_process_element_hcall {
+	__be64 version;
+	__be64 flags;
+	u8     reserved0[12];
+	__be32 pslVirtualIsn;
+	u8     applicationVirtualIsnBitmap[256];
+	u8     reserved1[144];
+	struct cxl_process_element_common common;
+	u8     reserved4[12];
+} __packed;
+
+#define H_STATE_NORMAL              1
+#define H_STATE_DISABLE             2
+#define H_STATE_TEMP_UNAVAILABLE    3
+#define H_STATE_PERM_UNAVAILABLE    4
+
+/* NOTE: element must be a logical real address, and must be pinned */
+long cxl_h_attach_process(u64 unit_address, struct cxl_process_element_hcall *element,
+			u64 *process_token, u64 *mmio_addr, u64 *mmio_size);
+
+/**
+ * cxl_h_detach_process - Detach a process element from a coherent
+ *                        platform function.
+ */
+long cxl_h_detach_process(u64 unit_address, u64 process_token);
+
+/**
+ * cxl_h_reset_afu - Perform a reset to the coherent platform function.
+ */
+long cxl_h_reset_afu(u64 unit_address);
+
+/**
+ * cxl_h_suspend_process - Suspend a process from being executed
+ * Parameter1 = process-token as returned from H_ATTACH_CA_PROCESS when
+ *              process was attached.
+ */
+long cxl_h_suspend_process(u64 unit_address, u64 process_token);
+
+/**
+ * cxl_h_resume_process - Resume a process to be executed
+ * Parameter1 = process-token as returned from H_ATTACH_CA_PROCESS when
+ *              process was attached.
+ */
+long cxl_h_resume_process(u64 unit_address, u64 process_token);
+
+/**
+ * cxl_h_read_error_state - Reads the error state of the coherent
+ *                          platform function.
+ * R4 contains the error state
+ */
+long cxl_h_read_error_state(u64 unit_address, u64 *state);
+
+/**
+ * cxl_h_get_afu_err - collect the AFU error buffer
+ * Parameter1 = byte offset into error buffer to retrieve, valid values
+ *              are between 0 and (ibm,error-buffer-size - 1)
+ * Parameter2 = 4K aligned real address of error buffer, to be filled in
+ * Parameter3 = length of error buffer, valid values are 4K or less
+ */
+long cxl_h_get_afu_err(u64 unit_address, u64 offset, u64 buf_address, u64 len);
+
+/**
+ * cxl_h_get_config - collect configuration record for the
+ *                    coherent platform function
+ * Parameter1 = # of configuration record to retrieve, valid values are
+ *              between 0 and (ibm,#config-records - 1)
+ * Parameter2 = byte offset into configuration record to retrieve,
+ *              valid values are between 0 and (ibm,config-record-size - 1)
+ * Parameter3 = 4K aligned real address of configuration record buffer,
+ *              to be filled in
+ * Parameter4 = length of configuration buffer, valid values are 4K or less
+ */
+long cxl_h_get_config(u64 unit_address, u64 cr_num, u64 offset,
+		u64 buf_address, u64 len);
+
+/**
+ * cxl_h_terminate_process - Terminate the process before completion
+ * Parameter1 = process-token as returned from H_ATTACH_CA_PROCESS when
+ *              process was attached.
+ */
+long cxl_h_terminate_process(u64 unit_address, u64 process_token);
+
+/**
+ * cxl_h_collect_vpd - Collect VPD for the coherent platform function.
+ * Parameter1 = # of VPD record to retrieve, valid values are between 0
+ *              and (ibm,#config-records - 1).
+ * Parameter2 = 4K naturally aligned real buffer containing block
+ *              list entries
+ * Parameter3 = number of block list entries in the block list, valid
+ *              values are between 0 and 256
+ */
+long cxl_h_collect_vpd(u64 unit_address, u64 record, u64 list_address,
+		       u64 num, u64 *out);
+
+/**
+ * cxl_h_get_fn_error_interrupt - Read the function-wide error data based on an interrupt
+ */
+long cxl_h_get_fn_error_interrupt(u64 unit_address, u64 *reg);
+
+/**
+ * cxl_h_ack_fn_error_interrupt - Acknowledge function-wide error data
+ *                                based on an interrupt
+ * Parameter1 = value to write to the function-wide error interrupt register
+ */
+long cxl_h_ack_fn_error_interrupt(u64 unit_address, u64 value);
+
+/**
+ * cxl_h_get_error_log - Retrieve the Platform Log ID (PLID) of
+ *                       an error log
+ */
+long cxl_h_get_error_log(u64 unit_address, u64 value);
+
+/**
+ * cxl_h_collect_int_info - Collect interrupt info about a coherent
+ *                          platform function after an interrupt occurred.
+ */
+long cxl_h_collect_int_info(u64 unit_address, u64 process_token,
+			struct cxl_irq_info *info);
+
+/**
+ * cxl_h_control_faults - Control the operation of a coherent platform
+ *                        function after a fault occurs.
+ *
+ * Parameters
+ *    control-mask: value to control the faults
+ *                  looks like PSL_TFC_An shifted >> 32
+ *    reset-mask: mask to control reset of function faults
+ *                Set reset_mask = 1 to reset PSL errors
+ */
+long cxl_h_control_faults(u64 unit_address, u64 process_token,
+			u64 control_mask, u64 reset_mask);
+
+/**
+ * cxl_h_reset_adapter - Perform a reset to the coherent platform facility.
+ */
+long cxl_h_reset_adapter(u64 unit_address);
+
+/**
+ * cxl_h_collect_vpd - Collect VPD for the coherent platform function.
+ * Parameter1 = 4K naturally aligned real buffer containing block
+ *              list entries
+ * Parameter2 = number of block list entries in the block list, valid
+ *              values are between 0 and 256
+ */
+long cxl_h_collect_vpd_adapter(u64 unit_address, u64 list_address,
+			       u64 num, u64 *out);
+
+/**
+ * cxl_h_download_adapter_image - Download the base image to the coherent
+ *                                platform facility.
+ */
+long cxl_h_download_adapter_image(u64 unit_address,
+				  u64 list_address, u64 num,
+				  u64 *out);
+
+/**
+ * cxl_h_validate_adapter_image - Validate the base image in the coherent
+ *                                platform facility.
+ */
+long cxl_h_validate_adapter_image(u64 unit_address,
+				  u64 list_address, u64 num,
+				  u64 *out);
+#endif /* _HCALLS_H */
diff --git a/drivers/misc/cxl/irq.c b/drivers/misc/cxl/irq.c
index 09a406058c46..be646dc41a2c 100644
--- a/drivers/misc/cxl/irq.c
+++ b/drivers/misc/cxl/irq.c
@@ -19,70 +19,11 @@
 #include "cxl.h"
 #include "trace.h"
 
-/* XXX: This is implementation specific */
-static irqreturn_t handle_psl_slice_error(struct cxl_context *ctx, u64 dsisr, u64 errstat)
+static int afu_irq_range_start(void)
 {
-	u64 fir1, fir2, fir_slice, serr, afu_debug;
-
-	fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL_FIR1);
-	fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL_FIR2);
-	fir_slice = cxl_p1n_read(ctx->afu, CXL_PSL_FIR_SLICE_An);
-	serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
-	afu_debug = cxl_p1n_read(ctx->afu, CXL_AFU_DEBUG_An);
-
-	dev_crit(&ctx->afu->dev, "PSL ERROR STATUS: 0x%016llx\n", errstat);
-	dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
-	dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
-	dev_crit(&ctx->afu->dev, "PSL_SERR_An: 0x%016llx\n", serr);
-	dev_crit(&ctx->afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
-	dev_crit(&ctx->afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
-
-	dev_crit(&ctx->afu->dev, "STOPPING CXL TRACE\n");
-	cxl_stop_trace(ctx->afu->adapter);
-
-	return cxl_ack_irq(ctx, 0, errstat);
-}
-
-irqreturn_t cxl_slice_irq_err(int irq, void *data)
-{
-	struct cxl_afu *afu = data;
-	u64 fir_slice, errstat, serr, afu_debug;
-
-	WARN(irq, "CXL SLICE ERROR interrupt %i\n", irq);
-
-	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
-	fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
-	errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
-	afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
-	dev_crit(&afu->dev, "PSL_SERR_An: 0x%016llx\n", serr);
-	dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
-	dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat);
-	dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
-
-	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t cxl_irq_err(int irq, void *data)
-{
-	struct cxl *adapter = data;
-	u64 fir1, fir2, err_ivte;
-
-	WARN(1, "CXL ERROR interrupt %i\n", irq);
-
-	err_ivte = cxl_p1_read(adapter, CXL_PSL_ErrIVTE);
-	dev_crit(&adapter->dev, "PSL_ErrIVTE: 0x%016llx\n", err_ivte);
-
-	dev_crit(&adapter->dev, "STOPPING CXL TRACE\n");
-	cxl_stop_trace(adapter);
-
-	fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
-	fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
-
-	dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", fir1, fir2);
-
-	return IRQ_HANDLED;
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		return 1;
+	return 0;
 }
 
 static irqreturn_t schedule_cxl_fault(struct cxl_context *ctx, u64 dsisr, u64 dar)
@@ -93,9 +34,8 @@ static irqreturn_t schedule_cxl_fault(struct cxl_context *ctx, u64 dsisr, u64 da
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t cxl_irq(int irq, void *data, struct cxl_irq_info *irq_info)
+irqreturn_t cxl_irq(int irq, struct cxl_context *ctx, struct cxl_irq_info *irq_info)
 {
-	struct cxl_context *ctx = data;
 	u64 dsisr, dar;
 
 	dsisr = irq_info->dsisr;
@@ -145,7 +85,8 @@ static irqreturn_t cxl_irq(int irq, void *data, struct cxl_irq_info *irq_info)
 	if (dsisr & CXL_PSL_DSISR_An_UR)
 		pr_devel("CXL interrupt: AURP PTE not found\n");
 	if (dsisr & CXL_PSL_DSISR_An_PE)
-		return handle_psl_slice_error(ctx, dsisr, irq_info->errstat);
+		return cxl_ops->handle_psl_slice_error(ctx, dsisr,
+						irq_info->errstat);
 	if (dsisr & CXL_PSL_DSISR_An_AE) {
 		pr_devel("CXL interrupt: AFU Error 0x%016llx\n", irq_info->afu_err);
 
@@ -169,7 +110,7 @@ static irqreturn_t cxl_irq(int irq, void *data, struct cxl_irq_info *irq_info)
 			wake_up_all(&ctx->wq);
 		}
 
-		cxl_ack_irq(ctx, CXL_PSL_TFC_An_A, 0);
+		cxl_ops->ack_irq(ctx, CXL_PSL_TFC_An_A, 0);
 		return IRQ_HANDLED;
 	}
 	if (dsisr & CXL_PSL_DSISR_An_OC)
@@ -179,54 +120,27 @@ static irqreturn_t cxl_irq(int irq, void *data, struct cxl_irq_info *irq_info)
 	return IRQ_HANDLED;
 }
 
-static irqreturn_t fail_psl_irq(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
-{
-	if (irq_info->dsisr & CXL_PSL_DSISR_TRANS)
-		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
-	else
-		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
-
-	return IRQ_HANDLED;
-}
-
-static irqreturn_t cxl_irq_multiplexed(int irq, void *data)
-{
-	struct cxl_afu *afu = data;
-	struct cxl_context *ctx;
-	struct cxl_irq_info irq_info;
-	int ph = cxl_p2n_read(afu, CXL_PSL_PEHandle_An) & 0xffff;
-	int ret;
-
-	if ((ret = cxl_get_irq(afu, &irq_info))) {
-		WARN(1, "Unable to get CXL IRQ Info: %i\n", ret);
-		return fail_psl_irq(afu, &irq_info);
-	}
-
-	rcu_read_lock();
-	ctx = idr_find(&afu->contexts_idr, ph);
-	if (ctx) {
-		ret = cxl_irq(irq, ctx, &irq_info);
-		rcu_read_unlock();
-		return ret;
-	}
-	rcu_read_unlock();
-
-	WARN(1, "Unable to demultiplex CXL PSL IRQ for PE %i DSISR %016llx DAR"
-		" %016llx\n(Possible AFU HW issue - was a term/remove acked"
-		" with outstanding transactions?)\n", ph, irq_info.dsisr,
-		irq_info.dar);
-	return fail_psl_irq(afu, &irq_info);
-}
-
 static irqreturn_t cxl_irq_afu(int irq, void *data)
 {
 	struct cxl_context *ctx = data;
 	irq_hw_number_t hwirq = irqd_to_hwirq(irq_get_irq_data(irq));
-	int irq_off, afu_irq = 1;
+	int irq_off, afu_irq = 0;
 	__u16 range;
 	int r;
 
-	for (r = 1; r < CXL_IRQ_RANGES; r++) {
+	/*
+	 * Look for the interrupt number.
+	 * On bare-metal, we know range 0 only contains the PSL
+	 * interrupt so we could start counting at range 1 and initialize
+	 * afu_irq at 1.
+	 * In a guest, range 0 also contains AFU interrupts, so it must
+	 * be counted for. Therefore we initialize afu_irq at 0 to take into
+	 * account the PSL interrupt.
+	 *
+	 * For code-readability, it just seems easier to go over all
+	 * the ranges on bare-metal and guest. The end result is the same.
+	 */
+	for (r = 0; r < CXL_IRQ_RANGES; r++) {
 		irq_off = hwirq - ctx->irqs.offset[r];
 		range = ctx->irqs.range[r];
 		if (irq_off >= 0 && irq_off < range) {
@@ -236,7 +150,7 @@ static irqreturn_t cxl_irq_afu(int irq, void *data)
 		afu_irq += range;
 	}
 	if (unlikely(r >= CXL_IRQ_RANGES)) {
-		WARN(1, "Recieved AFU IRQ out of range for pe %i (virq %i hwirq %lx)\n",
+		WARN(1, "Received AFU IRQ out of range for pe %i (virq %i hwirq %lx)\n",
 		     ctx->pe, irq, hwirq);
 		return IRQ_HANDLED;
 	}
@@ -246,7 +160,7 @@ static irqreturn_t cxl_irq_afu(int irq, void *data)
 	       afu_irq, ctx->pe, irq, hwirq);
 
 	if (unlikely(!ctx->irq_bitmap)) {
-		WARN(1, "Recieved AFU IRQ for context with no IRQ bitmap\n");
+		WARN(1, "Received AFU IRQ for context with no IRQ bitmap\n");
 		return IRQ_HANDLED;
 	}
 	spin_lock(&ctx->lock);
@@ -272,7 +186,8 @@ unsigned int cxl_map_irq(struct cxl *adapter, irq_hw_number_t hwirq,
 		return 0;
 	}
 
-	cxl_setup_irq(adapter, hwirq, virq);
+	if (cxl_ops->setup_irq)
+		cxl_ops->setup_irq(adapter, hwirq, virq);
 
 	pr_devel("hwirq %#lx mapped to virq %u\n", hwirq, virq);
 
@@ -291,16 +206,16 @@ void cxl_unmap_irq(unsigned int virq, void *cookie)
 	irq_dispose_mapping(virq);
 }
 
-static int cxl_register_one_irq(struct cxl *adapter,
-				irq_handler_t handler,
-				void *cookie,
-				irq_hw_number_t *dest_hwirq,
-				unsigned int *dest_virq,
-				const char *name)
+int cxl_register_one_irq(struct cxl *adapter,
+			irq_handler_t handler,
+			void *cookie,
+			irq_hw_number_t *dest_hwirq,
+			unsigned int *dest_virq,
+			const char *name)
 {
 	int hwirq, virq;
 
-	if ((hwirq = cxl_alloc_one_irq(adapter)) < 0)
+	if ((hwirq = cxl_ops->alloc_one_irq(adapter)) < 0)
 		return hwirq;
 
 	if (!(virq = cxl_map_irq(adapter, hwirq, handler, cookie, name)))
@@ -312,108 +227,10 @@ static int cxl_register_one_irq(struct cxl *adapter,
 	return 0;
 
 err:
-	cxl_release_one_irq(adapter, hwirq);
+	cxl_ops->release_one_irq(adapter, hwirq);
 	return -ENOMEM;
 }
 
-int cxl_register_psl_err_irq(struct cxl *adapter)
-{
-	int rc;
-
-	adapter->irq_name = kasprintf(GFP_KERNEL, "cxl-%s-err",
-				      dev_name(&adapter->dev));
-	if (!adapter->irq_name)
-		return -ENOMEM;
-
-	if ((rc = cxl_register_one_irq(adapter, cxl_irq_err, adapter,
-				       &adapter->err_hwirq,
-				       &adapter->err_virq,
-				       adapter->irq_name))) {
-		kfree(adapter->irq_name);
-		adapter->irq_name = NULL;
-		return rc;
-	}
-
-	cxl_p1_write(adapter, CXL_PSL_ErrIVTE, adapter->err_hwirq & 0xffff);
-
-	return 0;
-}
-
-void cxl_release_psl_err_irq(struct cxl *adapter)
-{
-	if (adapter->err_virq != irq_find_mapping(NULL, adapter->err_hwirq))
-		return;
-
-	cxl_p1_write(adapter, CXL_PSL_ErrIVTE, 0x0000000000000000);
-	cxl_unmap_irq(adapter->err_virq, adapter);
-	cxl_release_one_irq(adapter, adapter->err_hwirq);
-	kfree(adapter->irq_name);
-}
-
-int cxl_register_serr_irq(struct cxl_afu *afu)
-{
-	u64 serr;
-	int rc;
-
-	afu->err_irq_name = kasprintf(GFP_KERNEL, "cxl-%s-err",
-				      dev_name(&afu->dev));
-	if (!afu->err_irq_name)
-		return -ENOMEM;
-
-	if ((rc = cxl_register_one_irq(afu->adapter, cxl_slice_irq_err, afu,
-				       &afu->serr_hwirq,
-				       &afu->serr_virq, afu->err_irq_name))) {
-		kfree(afu->err_irq_name);
-		afu->err_irq_name = NULL;
-		return rc;
-	}
-
-	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
-	serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
-	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
-
-	return 0;
-}
-
-void cxl_release_serr_irq(struct cxl_afu *afu)
-{
-	if (afu->serr_virq != irq_find_mapping(NULL, afu->serr_hwirq))
-		return;
-
-	cxl_p1n_write(afu, CXL_PSL_SERR_An, 0x0000000000000000);
-	cxl_unmap_irq(afu->serr_virq, afu);
-	cxl_release_one_irq(afu->adapter, afu->serr_hwirq);
-	kfree(afu->err_irq_name);
-}
-
-int cxl_register_psl_irq(struct cxl_afu *afu)
-{
-	int rc;
-
-	afu->psl_irq_name = kasprintf(GFP_KERNEL, "cxl-%s",
-				      dev_name(&afu->dev));
-	if (!afu->psl_irq_name)
-		return -ENOMEM;
-
-	if ((rc = cxl_register_one_irq(afu->adapter, cxl_irq_multiplexed, afu,
-				    &afu->psl_hwirq, &afu->psl_virq,
-				    afu->psl_irq_name))) {
-		kfree(afu->psl_irq_name);
-		afu->psl_irq_name = NULL;
-	}
-	return rc;
-}
-
-void cxl_release_psl_irq(struct cxl_afu *afu)
-{
-	if (afu->psl_virq != irq_find_mapping(NULL, afu->psl_hwirq))
-		return;
-
-	cxl_unmap_irq(afu->psl_virq, afu);
-	cxl_release_one_irq(afu->adapter, afu->psl_hwirq);
-	kfree(afu->psl_irq_name);
-}
-
 void afu_irq_name_free(struct cxl_context *ctx)
 {
 	struct cxl_irq_name *irq_name, *tmp;
@@ -429,16 +246,33 @@ int afu_allocate_irqs(struct cxl_context *ctx, u32 count)
 {
 	int rc, r, i, j = 1;
 	struct cxl_irq_name *irq_name;
+	int alloc_count;
+
+	/*
+	 * In native mode, range 0 is reserved for the multiplexed
+	 * PSL interrupt. It has been allocated when the AFU was initialized.
+	 *
+	 * In a guest, the PSL interrupt is not mutliplexed, but per-context,
+	 * and is the first interrupt from range 0. It still needs to be
+	 * allocated, so bump the count by one.
+	 */
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		alloc_count = count;
+	else
+		alloc_count = count + 1;
 
 	/* Initialize the list head to hold irq names */
 	INIT_LIST_HEAD(&ctx->irq_names);
 
-	if ((rc = cxl_alloc_irq_ranges(&ctx->irqs, ctx->afu->adapter, count)))
+	if ((rc = cxl_ops->alloc_irq_ranges(&ctx->irqs, ctx->afu->adapter,
+							alloc_count)))
 		return rc;
 
-	/* Multiplexed PSL Interrupt */
-	ctx->irqs.offset[0] = ctx->afu->psl_hwirq;
-	ctx->irqs.range[0] = 1;
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		/* Multiplexed PSL Interrupt */
+		ctx->irqs.offset[0] = ctx->afu->native->psl_hwirq;
+		ctx->irqs.range[0] = 1;
+	}
 
 	ctx->irq_count = count;
 	ctx->irq_bitmap = kcalloc(BITS_TO_LONGS(count),
@@ -450,7 +284,7 @@ int afu_allocate_irqs(struct cxl_context *ctx, u32 count)
 	 * Allocate names first.  If any fail, bail out before allocating
 	 * actual hardware IRQs.
 	 */
-	for (r = 1; r < CXL_IRQ_RANGES; r++) {
+	for (r = afu_irq_range_start(); r < CXL_IRQ_RANGES; r++) {
 		for (i = 0; i < ctx->irqs.range[r]; i++) {
 			irq_name = kmalloc(sizeof(struct cxl_irq_name),
 					   GFP_KERNEL);
@@ -471,7 +305,7 @@ int afu_allocate_irqs(struct cxl_context *ctx, u32 count)
 	return 0;
 
 out:
-	cxl_release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
+	cxl_ops->release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
 	afu_irq_name_free(ctx);
 	return -ENOMEM;
 }
@@ -480,15 +314,30 @@ static void afu_register_hwirqs(struct cxl_context *ctx)
 {
 	irq_hw_number_t hwirq;
 	struct cxl_irq_name *irq_name;
-	int r,i;
+	int r, i;
+	irqreturn_t (*handler)(int irq, void *data);
 
 	/* We've allocated all memory now, so let's do the irq allocations */
 	irq_name = list_first_entry(&ctx->irq_names, struct cxl_irq_name, list);
-	for (r = 1; r < CXL_IRQ_RANGES; r++) {
+	for (r = afu_irq_range_start(); r < CXL_IRQ_RANGES; r++) {
 		hwirq = ctx->irqs.offset[r];
 		for (i = 0; i < ctx->irqs.range[r]; hwirq++, i++) {
-			cxl_map_irq(ctx->afu->adapter, hwirq,
-				    cxl_irq_afu, ctx, irq_name->name);
+			if (r == 0 && i == 0)
+				/*
+				 * The very first interrupt of range 0 is
+				 * always the PSL interrupt, but we only
+				 * need to connect a handler for guests,
+				 * because there's one PSL interrupt per
+				 * context.
+				 * On bare-metal, the PSL interrupt is
+				 * multiplexed and was setup when the AFU
+				 * was configured.
+				 */
+				handler = cxl_ops->psl_interrupt;
+			else
+				handler = cxl_irq_afu;
+			cxl_map_irq(ctx->afu->adapter, hwirq, handler, ctx,
+				irq_name->name);
 			irq_name = list_next_entry(irq_name, list);
 		}
 	}
@@ -504,7 +353,7 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count)
 
 	afu_register_hwirqs(ctx);
 	return 0;
- }
+}
 
 void afu_release_irqs(struct cxl_context *ctx, void *cookie)
 {
@@ -512,7 +361,7 @@ void afu_release_irqs(struct cxl_context *ctx, void *cookie)
 	unsigned int virq;
 	int r, i;
 
-	for (r = 1; r < CXL_IRQ_RANGES; r++) {
+	for (r = afu_irq_range_start(); r < CXL_IRQ_RANGES; r++) {
 		hwirq = ctx->irqs.offset[r];
 		for (i = 0; i < ctx->irqs.range[r]; hwirq++, i++) {
 			virq = irq_find_mapping(NULL, hwirq);
@@ -522,7 +371,7 @@ void afu_release_irqs(struct cxl_context *ctx, void *cookie)
 	}
 
 	afu_irq_name_free(ctx);
-	cxl_release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
+	cxl_ops->release_irq_ranges(&ctx->irqs, ctx->afu->adapter);
 
 	ctx->irq_count = 0;
 }
diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c
index 9fde75ed4fac..ae68c3201156 100644
--- a/drivers/misc/cxl/main.c
+++ b/drivers/misc/cxl/main.c
@@ -32,6 +32,29 @@ uint cxl_verbose;
 module_param_named(verbose, cxl_verbose, uint, 0600);
 MODULE_PARM_DESC(verbose, "Enable verbose dmesg output");
 
+const struct cxl_backend_ops *cxl_ops;
+
+int cxl_afu_slbia(struct cxl_afu *afu)
+{
+	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
+
+	pr_devel("cxl_afu_slbia issuing SLBIA command\n");
+	cxl_p2n_write(afu, CXL_SLBIA_An, CXL_TLB_SLB_IQ_ALL);
+	while (cxl_p2n_read(afu, CXL_SLBIA_An) & CXL_TLB_SLB_P) {
+		if (time_after_eq(jiffies, timeout)) {
+			dev_warn(&afu->dev, "WARNING: CXL AFU SLBIA timed out!\n");
+			return -EBUSY;
+		}
+		/* If the adapter has gone down, we can assume that we
+		 * will PERST it and that will invalidate everything.
+		 */
+		if (!cxl_ops->link_ok(afu->adapter, afu))
+			return -EIO;
+		cpu_relax();
+	}
+	return 0;
+}
+
 static inline void _cxl_slbia(struct cxl_context *ctx, struct mm_struct *mm)
 {
 	struct task_struct *task;
@@ -139,6 +162,32 @@ int cxl_alloc_sst(struct cxl_context *ctx)
 	return 0;
 }
 
+/* print buffer content as integers when debugging */
+void cxl_dump_debug_buffer(void *buf, size_t buf_len)
+{
+#ifdef DEBUG
+	int i, *ptr;
+
+	/*
+	 * We want to regroup up to 4 integers per line, which means they
+	 * need to be in the same pr_devel() statement
+	 */
+	ptr = (int *) buf;
+	for (i = 0; i * 4 < buf_len; i += 4) {
+		if ((i + 3) * 4 < buf_len)
+			pr_devel("%.8x %.8x %.8x %.8x\n", ptr[i], ptr[i + 1],
+				ptr[i + 2], ptr[i + 3]);
+		else if ((i + 2) * 4 < buf_len)
+			pr_devel("%.8x %.8x %.8x\n", ptr[i], ptr[i + 1],
+				ptr[i + 2]);
+		else if ((i + 1) * 4 < buf_len)
+			pr_devel("%.8x %.8x\n", ptr[i], ptr[i + 1]);
+		else
+			pr_devel("%.8x\n", ptr[i]);
+	}
+#endif /* DEBUG */
+}
+
 /* Find a CXL adapter by it's number and increase it's refcount */
 struct cxl *get_cxl_adapter(int num)
 {
@@ -152,7 +201,7 @@ struct cxl *get_cxl_adapter(int num)
 	return adapter;
 }
 
-int cxl_alloc_adapter_nr(struct cxl *adapter)
+static int cxl_alloc_adapter_nr(struct cxl *adapter)
 {
 	int i;
 
@@ -174,13 +223,58 @@ void cxl_remove_adapter_nr(struct cxl *adapter)
 	idr_remove(&cxl_adapter_idr, adapter->adapter_num);
 }
 
+struct cxl *cxl_alloc_adapter(void)
+{
+	struct cxl *adapter;
+
+	if (!(adapter = kzalloc(sizeof(struct cxl), GFP_KERNEL)))
+		return NULL;
+
+	spin_lock_init(&adapter->afu_list_lock);
+
+	if (cxl_alloc_adapter_nr(adapter))
+		goto err1;
+
+	if (dev_set_name(&adapter->dev, "card%i", adapter->adapter_num))
+		goto err2;
+
+	return adapter;
+
+err2:
+	cxl_remove_adapter_nr(adapter);
+err1:
+	kfree(adapter);
+	return NULL;
+}
+
+struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice)
+{
+	struct cxl_afu *afu;
+
+	if (!(afu = kzalloc(sizeof(struct cxl_afu), GFP_KERNEL)))
+		return NULL;
+
+	afu->adapter = adapter;
+	afu->dev.parent = &adapter->dev;
+	afu->dev.release = cxl_ops->release_afu;
+	afu->slice = slice;
+	idr_init(&afu->contexts_idr);
+	mutex_init(&afu->contexts_lock);
+	spin_lock_init(&afu->afu_cntl_lock);
+
+	afu->prefault_mode = CXL_PREFAULT_NONE;
+	afu->irqs_max = afu->adapter->user_irqs;
+
+	return afu;
+}
+
 int cxl_afu_select_best_mode(struct cxl_afu *afu)
 {
 	if (afu->modes_supported & CXL_MODE_DIRECTED)
-		return cxl_afu_activate_mode(afu, CXL_MODE_DIRECTED);
+		return cxl_ops->afu_activate_mode(afu, CXL_MODE_DIRECTED);
 
 	if (afu->modes_supported & CXL_MODE_DEDICATED)
-		return cxl_afu_activate_mode(afu, CXL_MODE_DEDICATED);
+		return cxl_ops->afu_activate_mode(afu, CXL_MODE_DEDICATED);
 
 	dev_warn(&afu->dev, "No supported programming modes available\n");
 	/* We don't fail this so the user can inspect sysfs */
@@ -191,9 +285,6 @@ static int __init init_cxl(void)
 {
 	int rc = 0;
 
-	if (!cpu_has_feature(CPU_FTR_HVMODE))
-		return -EPERM;
-
 	if ((rc = cxl_file_init()))
 		return rc;
 
@@ -202,7 +293,17 @@ static int __init init_cxl(void)
 	if ((rc = register_cxl_calls(&cxl_calls)))
 		goto err;
 
-	if ((rc = pci_register_driver(&cxl_pci_driver)))
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		cxl_ops = &cxl_native_ops;
+		rc = pci_register_driver(&cxl_pci_driver);
+	}
+#ifdef CONFIG_PPC_PSERIES
+	else {
+		cxl_ops = &cxl_guest_ops;
+		rc = platform_driver_register(&cxl_of_driver);
+	}
+#endif
+	if (rc)
 		goto err1;
 
 	return 0;
@@ -217,7 +318,12 @@ err:
 
 static void exit_cxl(void)
 {
-	pci_unregister_driver(&cxl_pci_driver);
+	if (cpu_has_feature(CPU_FTR_HVMODE))
+		pci_unregister_driver(&cxl_pci_driver);
+#ifdef CONFIG_PPC_PSERIES
+	else
+		platform_driver_unregister(&cxl_of_driver);
+#endif
 
 	cxl_debugfs_exit();
 	cxl_file_exit();
diff --git a/drivers/misc/cxl/native.c b/drivers/misc/cxl/native.c
index f40909793490..387fcbdf9793 100644
--- a/drivers/misc/cxl/native.c
+++ b/drivers/misc/cxl/native.c
@@ -42,7 +42,7 @@ static int afu_control(struct cxl_afu *afu, u64 command,
 			goto out;
 		}
 
-		if (!cxl_adapter_link_ok(afu->adapter)) {
+		if (!cxl_ops->link_ok(afu->adapter, afu)) {
 			afu->enabled = enabled;
 			rc = -EIO;
 			goto out;
@@ -80,7 +80,7 @@ int cxl_afu_disable(struct cxl_afu *afu)
 }
 
 /* This will disable as well as reset */
-int __cxl_afu_reset(struct cxl_afu *afu)
+static int native_afu_reset(struct cxl_afu *afu)
 {
 	pr_devel("AFU reset request\n");
 
@@ -90,9 +90,9 @@ int __cxl_afu_reset(struct cxl_afu *afu)
 			   false);
 }
 
-int cxl_afu_check_and_enable(struct cxl_afu *afu)
+static int native_afu_check_and_enable(struct cxl_afu *afu)
 {
-	if (!cxl_adapter_link_ok(afu->adapter)) {
+	if (!cxl_ops->link_ok(afu->adapter, afu)) {
 		WARN(1, "Refusing to enable afu while link down!\n");
 		return -EIO;
 	}
@@ -114,7 +114,7 @@ int cxl_psl_purge(struct cxl_afu *afu)
 
 	pr_devel("PSL purge request\n");
 
-	if (!cxl_adapter_link_ok(afu->adapter)) {
+	if (!cxl_ops->link_ok(afu->adapter, afu)) {
 		dev_warn(&afu->dev, "PSL Purge called with link down, ignoring\n");
 		rc = -EIO;
 		goto out;
@@ -136,7 +136,7 @@ int cxl_psl_purge(struct cxl_afu *afu)
 			rc = -EBUSY;
 			goto out;
 		}
-		if (!cxl_adapter_link_ok(afu->adapter)) {
+		if (!cxl_ops->link_ok(afu->adapter, afu)) {
 			rc = -EIO;
 			goto out;
 		}
@@ -186,22 +186,22 @@ static int spa_max_procs(int spa_size)
 int cxl_alloc_spa(struct cxl_afu *afu)
 {
 	/* Work out how many pages to allocate */
-	afu->spa_order = 0;
+	afu->native->spa_order = 0;
 	do {
-		afu->spa_order++;
-		afu->spa_size = (1 << afu->spa_order) * PAGE_SIZE;
-		afu->spa_max_procs = spa_max_procs(afu->spa_size);
-	} while (afu->spa_max_procs < afu->num_procs);
+		afu->native->spa_order++;
+		afu->native->spa_size = (1 << afu->native->spa_order) * PAGE_SIZE;
+		afu->native->spa_max_procs = spa_max_procs(afu->native->spa_size);
+	} while (afu->native->spa_max_procs < afu->num_procs);
 
-	WARN_ON(afu->spa_size > 0x100000); /* Max size supported by the hardware */
+	WARN_ON(afu->native->spa_size > 0x100000); /* Max size supported by the hardware */
 
-	if (!(afu->spa = (struct cxl_process_element *)
-	      __get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->spa_order))) {
+	if (!(afu->native->spa = (struct cxl_process_element *)
+	      __get_free_pages(GFP_KERNEL | __GFP_ZERO, afu->native->spa_order))) {
 		pr_err("cxl_alloc_spa: Unable to allocate scheduled process area\n");
 		return -ENOMEM;
 	}
 	pr_devel("spa pages: %i afu->spa_max_procs: %i   afu->num_procs: %i\n",
-		 1<<afu->spa_order, afu->spa_max_procs, afu->num_procs);
+		 1<<afu->native->spa_order, afu->native->spa_max_procs, afu->num_procs);
 
 	return 0;
 }
@@ -210,13 +210,15 @@ static void attach_spa(struct cxl_afu *afu)
 {
 	u64 spap;
 
-	afu->sw_command_status = (__be64 *)((char *)afu->spa +
-					    ((afu->spa_max_procs + 3) * 128));
+	afu->native->sw_command_status = (__be64 *)((char *)afu->native->spa +
+					    ((afu->native->spa_max_procs + 3) * 128));
 
-	spap = virt_to_phys(afu->spa) & CXL_PSL_SPAP_Addr;
-	spap |= ((afu->spa_size >> (12 - CXL_PSL_SPAP_Size_Shift)) - 1) & CXL_PSL_SPAP_Size;
+	spap = virt_to_phys(afu->native->spa) & CXL_PSL_SPAP_Addr;
+	spap |= ((afu->native->spa_size >> (12 - CXL_PSL_SPAP_Size_Shift)) - 1) & CXL_PSL_SPAP_Size;
 	spap |= CXL_PSL_SPAP_V;
-	pr_devel("cxl: SPA allocated at 0x%p. Max processes: %i, sw_command_status: 0x%p CXL_PSL_SPAP_An=0x%016llx\n", afu->spa, afu->spa_max_procs, afu->sw_command_status, spap);
+	pr_devel("cxl: SPA allocated at 0x%p. Max processes: %i, sw_command_status: 0x%p CXL_PSL_SPAP_An=0x%016llx\n",
+		afu->native->spa, afu->native->spa_max_procs,
+		afu->native->sw_command_status, spap);
 	cxl_p1n_write(afu, CXL_PSL_SPAP_An, spap);
 }
 
@@ -227,9 +229,10 @@ static inline void detach_spa(struct cxl_afu *afu)
 
 void cxl_release_spa(struct cxl_afu *afu)
 {
-	if (afu->spa) {
-		free_pages((unsigned long) afu->spa, afu->spa_order);
-		afu->spa = NULL;
+	if (afu->native->spa) {
+		free_pages((unsigned long) afu->native->spa,
+			afu->native->spa_order);
+		afu->native->spa = NULL;
 	}
 }
 
@@ -247,7 +250,7 @@ int cxl_tlb_slb_invalidate(struct cxl *adapter)
 			dev_warn(&adapter->dev, "WARNING: CXL adapter wide TLBIA timed out!\n");
 			return -EBUSY;
 		}
-		if (!cxl_adapter_link_ok(adapter))
+		if (!cxl_ops->link_ok(adapter, NULL))
 			return -EIO;
 		cpu_relax();
 	}
@@ -258,28 +261,7 @@ int cxl_tlb_slb_invalidate(struct cxl *adapter)
 			dev_warn(&adapter->dev, "WARNING: CXL adapter wide SLBIA timed out!\n");
 			return -EBUSY;
 		}
-		if (!cxl_adapter_link_ok(adapter))
-			return -EIO;
-		cpu_relax();
-	}
-	return 0;
-}
-
-int cxl_afu_slbia(struct cxl_afu *afu)
-{
-	unsigned long timeout = jiffies + (HZ * CXL_TIMEOUT);
-
-	pr_devel("cxl_afu_slbia issuing SLBIA command\n");
-	cxl_p2n_write(afu, CXL_SLBIA_An, CXL_TLB_SLB_IQ_ALL);
-	while (cxl_p2n_read(afu, CXL_SLBIA_An) & CXL_TLB_SLB_P) {
-		if (time_after_eq(jiffies, timeout)) {
-			dev_warn(&afu->dev, "WARNING: CXL AFU SLBIA timed out!\n");
-			return -EBUSY;
-		}
-		/* If the adapter has gone down, we can assume that we
-		 * will PERST it and that will invalidate everything.
-		 */
-		if (!cxl_adapter_link_ok(afu->adapter))
+		if (!cxl_ops->link_ok(adapter, NULL))
 			return -EIO;
 		cpu_relax();
 	}
@@ -312,7 +294,7 @@ static void slb_invalid(struct cxl_context *ctx)
 	struct cxl *adapter = ctx->afu->adapter;
 	u64 slbia;
 
-	WARN_ON(!mutex_is_locked(&ctx->afu->spa_mutex));
+	WARN_ON(!mutex_is_locked(&ctx->afu->native->spa_mutex));
 
 	cxl_p1_write(adapter, CXL_PSL_LBISEL,
 			((u64)be32_to_cpu(ctx->elem->common.pid) << 32) |
@@ -320,7 +302,7 @@ static void slb_invalid(struct cxl_context *ctx)
 	cxl_p1_write(adapter, CXL_PSL_SLBIA, CXL_TLB_SLB_IQ_LPIDPID);
 
 	while (1) {
-		if (!cxl_adapter_link_ok(adapter))
+		if (!cxl_ops->link_ok(adapter, NULL))
 			break;
 		slbia = cxl_p1_read(adapter, CXL_PSL_SLBIA);
 		if (!(slbia & CXL_TLB_SLB_P))
@@ -342,7 +324,7 @@ static int do_process_element_cmd(struct cxl_context *ctx,
 
 	ctx->elem->software_state = cpu_to_be32(pe_state);
 	smp_wmb();
-	*(ctx->afu->sw_command_status) = cpu_to_be64(cmd | 0 | ctx->pe);
+	*(ctx->afu->native->sw_command_status) = cpu_to_be64(cmd | 0 | ctx->pe);
 	smp_mb();
 	cxl_p1n_write(ctx->afu, CXL_PSL_LLCMD_An, cmd | ctx->pe);
 	while (1) {
@@ -351,12 +333,12 @@ static int do_process_element_cmd(struct cxl_context *ctx,
 			rc = -EBUSY;
 			goto out;
 		}
-		if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
+		if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) {
 			dev_warn(&ctx->afu->dev, "WARNING: Device link down, aborting Process Element Command!\n");
 			rc = -EIO;
 			goto out;
 		}
-		state = be64_to_cpup(ctx->afu->sw_command_status);
+		state = be64_to_cpup(ctx->afu->native->sw_command_status);
 		if (state == ~0ULL) {
 			pr_err("cxl: Error adding process element to AFU\n");
 			rc = -1;
@@ -384,12 +366,12 @@ static int add_process_element(struct cxl_context *ctx)
 {
 	int rc = 0;
 
-	mutex_lock(&ctx->afu->spa_mutex);
+	mutex_lock(&ctx->afu->native->spa_mutex);
 	pr_devel("%s Adding pe: %i started\n", __func__, ctx->pe);
 	if (!(rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_ADD, CXL_PE_SOFTWARE_STATE_V)))
 		ctx->pe_inserted = true;
 	pr_devel("%s Adding pe: %i finished\n", __func__, ctx->pe);
-	mutex_unlock(&ctx->afu->spa_mutex);
+	mutex_unlock(&ctx->afu->native->spa_mutex);
 	return rc;
 }
 
@@ -401,18 +383,18 @@ static int terminate_process_element(struct cxl_context *ctx)
 	if (!(ctx->elem->software_state & cpu_to_be32(CXL_PE_SOFTWARE_STATE_V)))
 		return rc;
 
-	mutex_lock(&ctx->afu->spa_mutex);
+	mutex_lock(&ctx->afu->native->spa_mutex);
 	pr_devel("%s Terminate pe: %i started\n", __func__, ctx->pe);
 	/* We could be asked to terminate when the hw is down. That
 	 * should always succeed: it's not running if the hw has gone
 	 * away and is being reset.
 	 */
-	if (cxl_adapter_link_ok(ctx->afu->adapter))
+	if (cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
 		rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_TERMINATE,
 					    CXL_PE_SOFTWARE_STATE_V | CXL_PE_SOFTWARE_STATE_T);
 	ctx->elem->software_state = 0;	/* Remove Valid bit */
 	pr_devel("%s Terminate pe: %i finished\n", __func__, ctx->pe);
-	mutex_unlock(&ctx->afu->spa_mutex);
+	mutex_unlock(&ctx->afu->native->spa_mutex);
 	return rc;
 }
 
@@ -420,20 +402,20 @@ static int remove_process_element(struct cxl_context *ctx)
 {
 	int rc = 0;
 
-	mutex_lock(&ctx->afu->spa_mutex);
+	mutex_lock(&ctx->afu->native->spa_mutex);
 	pr_devel("%s Remove pe: %i started\n", __func__, ctx->pe);
 
 	/* We could be asked to remove when the hw is down. Again, if
 	 * the hw is down, the PE is gone, so we succeed.
 	 */
-	if (cxl_adapter_link_ok(ctx->afu->adapter))
+	if (cxl_ops->link_ok(ctx->afu->adapter, ctx->afu))
 		rc = do_process_element_cmd(ctx, CXL_SPA_SW_CMD_REMOVE, 0);
 
 	if (!rc)
 		ctx->pe_inserted = false;
 	slb_invalid(ctx);
 	pr_devel("%s Remove pe: %i finished\n", __func__, ctx->pe);
-	mutex_unlock(&ctx->afu->spa_mutex);
+	mutex_unlock(&ctx->afu->native->spa_mutex);
 
 	return rc;
 }
@@ -446,7 +428,7 @@ void cxl_assign_psn_space(struct cxl_context *ctx)
 		ctx->psn_size = ctx->afu->adapter->ps_size;
 	} else {
 		ctx->psn_phys = ctx->afu->psn_phys +
-			(ctx->afu->pp_offset + ctx->afu->pp_size * ctx->pe);
+			(ctx->afu->native->pp_offset + ctx->afu->pp_size * ctx->pe);
 		ctx->psn_size = ctx->afu->pp_size;
 	}
 }
@@ -458,7 +440,7 @@ static int activate_afu_directed(struct cxl_afu *afu)
 	dev_info(&afu->dev, "Activating AFU directed mode\n");
 
 	afu->num_procs = afu->max_procs_virtualised;
-	if (afu->spa == NULL) {
+	if (afu->native->spa == NULL) {
 		if (cxl_alloc_spa(afu))
 			return -ENOMEM;
 	}
@@ -552,7 +534,7 @@ static int attach_afu_directed(struct cxl_context *ctx, u64 wed, u64 amr)
 	ctx->elem->common.wed = cpu_to_be64(wed);
 
 	/* first guy needs to enable */
-	if ((result = cxl_afu_check_and_enable(ctx->afu)))
+	if ((result = cxl_ops->afu_check_and_enable(ctx->afu)))
 		return result;
 
 	return add_process_element(ctx);
@@ -568,7 +550,7 @@ static int deactivate_afu_directed(struct cxl_afu *afu)
 	cxl_sysfs_afu_m_remove(afu);
 	cxl_chardev_afu_remove(afu);
 
-	__cxl_afu_reset(afu);
+	cxl_ops->afu_reset(afu);
 	cxl_afu_disable(afu);
 	cxl_psl_purge(afu);
 
@@ -632,7 +614,7 @@ static int attach_dedicated(struct cxl_context *ctx, u64 wed, u64 amr)
 	/* master only context for dedicated */
 	cxl_assign_psn_space(ctx);
 
-	if ((rc = __cxl_afu_reset(afu)))
+	if ((rc = cxl_ops->afu_reset(afu)))
 		return rc;
 
 	cxl_p2n_write(afu, CXL_PSL_WED_An, wed);
@@ -652,7 +634,7 @@ static int deactivate_dedicated_process(struct cxl_afu *afu)
 	return 0;
 }
 
-int _cxl_afu_deactivate_mode(struct cxl_afu *afu, int mode)
+static int native_afu_deactivate_mode(struct cxl_afu *afu, int mode)
 {
 	if (mode == CXL_MODE_DIRECTED)
 		return deactivate_afu_directed(afu);
@@ -661,19 +643,14 @@ int _cxl_afu_deactivate_mode(struct cxl_afu *afu, int mode)
 	return 0;
 }
 
-int cxl_afu_deactivate_mode(struct cxl_afu *afu)
-{
-	return _cxl_afu_deactivate_mode(afu, afu->current_mode);
-}
-
-int cxl_afu_activate_mode(struct cxl_afu *afu, int mode)
+static int native_afu_activate_mode(struct cxl_afu *afu, int mode)
 {
 	if (!mode)
 		return 0;
 	if (!(mode & afu->modes_supported))
 		return -EINVAL;
 
-	if (!cxl_adapter_link_ok(afu->adapter)) {
+	if (!cxl_ops->link_ok(afu->adapter, afu)) {
 		WARN(1, "Device link is down, refusing to activate!\n");
 		return -EIO;
 	}
@@ -686,9 +663,10 @@ int cxl_afu_activate_mode(struct cxl_afu *afu, int mode)
 	return -EINVAL;
 }
 
-int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr)
+static int native_attach_process(struct cxl_context *ctx, bool kernel,
+				u64 wed, u64 amr)
 {
-	if (!cxl_adapter_link_ok(ctx->afu->adapter)) {
+	if (!cxl_ops->link_ok(ctx->afu->adapter, ctx->afu)) {
 		WARN(1, "Device link is down, refusing to attach process!\n");
 		return -EIO;
 	}
@@ -705,7 +683,7 @@ int cxl_attach_process(struct cxl_context *ctx, bool kernel, u64 wed, u64 amr)
 
 static inline int detach_process_native_dedicated(struct cxl_context *ctx)
 {
-	__cxl_afu_reset(ctx->afu);
+	cxl_ops->afu_reset(ctx->afu);
 	cxl_afu_disable(ctx->afu);
 	cxl_psl_purge(ctx->afu);
 	return 0;
@@ -723,7 +701,7 @@ static inline int detach_process_native_afu_directed(struct cxl_context *ctx)
 	return 0;
 }
 
-int cxl_detach_process(struct cxl_context *ctx)
+static int native_detach_process(struct cxl_context *ctx)
 {
 	trace_cxl_detach(ctx);
 
@@ -733,14 +711,14 @@ int cxl_detach_process(struct cxl_context *ctx)
 	return detach_process_native_afu_directed(ctx);
 }
 
-int cxl_get_irq(struct cxl_afu *afu, struct cxl_irq_info *info)
+static int native_get_irq_info(struct cxl_afu *afu, struct cxl_irq_info *info)
 {
 	u64 pidtid;
 
 	/* If the adapter has gone away, we can't get any meaningful
 	 * information.
 	 */
-	if (!cxl_adapter_link_ok(afu->adapter))
+	if (!cxl_ops->link_ok(afu->adapter, afu))
 		return -EIO;
 
 	info->dsisr = cxl_p2n_read(afu, CXL_PSL_DSISR_An);
@@ -751,10 +729,214 @@ int cxl_get_irq(struct cxl_afu *afu, struct cxl_irq_info *info)
 	info->tid = pidtid & 0xffffffff;
 	info->afu_err = cxl_p2n_read(afu, CXL_AFU_ERR_An);
 	info->errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
+	info->proc_handle = 0;
+
+	return 0;
+}
+
+static irqreturn_t native_handle_psl_slice_error(struct cxl_context *ctx,
+						u64 dsisr, u64 errstat)
+{
+	u64 fir1, fir2, fir_slice, serr, afu_debug;
+
+	fir1 = cxl_p1_read(ctx->afu->adapter, CXL_PSL_FIR1);
+	fir2 = cxl_p1_read(ctx->afu->adapter, CXL_PSL_FIR2);
+	fir_slice = cxl_p1n_read(ctx->afu, CXL_PSL_FIR_SLICE_An);
+	serr = cxl_p1n_read(ctx->afu, CXL_PSL_SERR_An);
+	afu_debug = cxl_p1n_read(ctx->afu, CXL_AFU_DEBUG_An);
+
+	dev_crit(&ctx->afu->dev, "PSL ERROR STATUS: 0x%016llx\n", errstat);
+	dev_crit(&ctx->afu->dev, "PSL_FIR1: 0x%016llx\n", fir1);
+	dev_crit(&ctx->afu->dev, "PSL_FIR2: 0x%016llx\n", fir2);
+	dev_crit(&ctx->afu->dev, "PSL_SERR_An: 0x%016llx\n", serr);
+	dev_crit(&ctx->afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
+	dev_crit(&ctx->afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
+
+	dev_crit(&ctx->afu->dev, "STOPPING CXL TRACE\n");
+	cxl_stop_trace(ctx->afu->adapter);
+
+	return cxl_ops->ack_irq(ctx, 0, errstat);
+}
+
+static irqreturn_t fail_psl_irq(struct cxl_afu *afu, struct cxl_irq_info *irq_info)
+{
+	if (irq_info->dsisr & CXL_PSL_DSISR_TRANS)
+		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_AE);
+	else
+		cxl_p2n_write(afu, CXL_PSL_TFC_An, CXL_PSL_TFC_An_A);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t native_irq_multiplexed(int irq, void *data)
+{
+	struct cxl_afu *afu = data;
+	struct cxl_context *ctx;
+	struct cxl_irq_info irq_info;
+	int ph = cxl_p2n_read(afu, CXL_PSL_PEHandle_An) & 0xffff;
+	int ret;
+
+	if ((ret = native_get_irq_info(afu, &irq_info))) {
+		WARN(1, "Unable to get CXL IRQ Info: %i\n", ret);
+		return fail_psl_irq(afu, &irq_info);
+	}
+
+	rcu_read_lock();
+	ctx = idr_find(&afu->contexts_idr, ph);
+	if (ctx) {
+		ret = cxl_irq(irq, ctx, &irq_info);
+		rcu_read_unlock();
+		return ret;
+	}
+	rcu_read_unlock();
+
+	WARN(1, "Unable to demultiplex CXL PSL IRQ for PE %i DSISR %016llx DAR"
+		" %016llx\n(Possible AFU HW issue - was a term/remove acked"
+		" with outstanding transactions?)\n", ph, irq_info.dsisr,
+		irq_info.dar);
+	return fail_psl_irq(afu, &irq_info);
+}
+
+static irqreturn_t native_slice_irq_err(int irq, void *data)
+{
+	struct cxl_afu *afu = data;
+	u64 fir_slice, errstat, serr, afu_debug;
+
+	WARN(irq, "CXL SLICE ERROR interrupt %i\n", irq);
+
+	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+	fir_slice = cxl_p1n_read(afu, CXL_PSL_FIR_SLICE_An);
+	errstat = cxl_p2n_read(afu, CXL_PSL_ErrStat_An);
+	afu_debug = cxl_p1n_read(afu, CXL_AFU_DEBUG_An);
+	dev_crit(&afu->dev, "PSL_SERR_An: 0x%016llx\n", serr);
+	dev_crit(&afu->dev, "PSL_FIR_SLICE_An: 0x%016llx\n", fir_slice);
+	dev_crit(&afu->dev, "CXL_PSL_ErrStat_An: 0x%016llx\n", errstat);
+	dev_crit(&afu->dev, "CXL_PSL_AFU_DEBUG_An: 0x%016llx\n", afu_debug);
+
+	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
+
+	return IRQ_HANDLED;
+}
+
+static irqreturn_t native_irq_err(int irq, void *data)
+{
+	struct cxl *adapter = data;
+	u64 fir1, fir2, err_ivte;
+
+	WARN(1, "CXL ERROR interrupt %i\n", irq);
+
+	err_ivte = cxl_p1_read(adapter, CXL_PSL_ErrIVTE);
+	dev_crit(&adapter->dev, "PSL_ErrIVTE: 0x%016llx\n", err_ivte);
+
+	dev_crit(&adapter->dev, "STOPPING CXL TRACE\n");
+	cxl_stop_trace(adapter);
+
+	fir1 = cxl_p1_read(adapter, CXL_PSL_FIR1);
+	fir2 = cxl_p1_read(adapter, CXL_PSL_FIR2);
+
+	dev_crit(&adapter->dev, "PSL_FIR1: 0x%016llx\nPSL_FIR2: 0x%016llx\n", fir1, fir2);
+
+	return IRQ_HANDLED;
+}
+
+int cxl_native_register_psl_err_irq(struct cxl *adapter)
+{
+	int rc;
+
+	adapter->irq_name = kasprintf(GFP_KERNEL, "cxl-%s-err",
+				      dev_name(&adapter->dev));
+	if (!adapter->irq_name)
+		return -ENOMEM;
+
+	if ((rc = cxl_register_one_irq(adapter, native_irq_err, adapter,
+				       &adapter->native->err_hwirq,
+				       &adapter->native->err_virq,
+				       adapter->irq_name))) {
+		kfree(adapter->irq_name);
+		adapter->irq_name = NULL;
+		return rc;
+	}
+
+	cxl_p1_write(adapter, CXL_PSL_ErrIVTE, adapter->native->err_hwirq & 0xffff);
+
+	return 0;
+}
+
+void cxl_native_release_psl_err_irq(struct cxl *adapter)
+{
+	if (adapter->native->err_virq != irq_find_mapping(NULL, adapter->native->err_hwirq))
+		return;
+
+	cxl_p1_write(adapter, CXL_PSL_ErrIVTE, 0x0000000000000000);
+	cxl_unmap_irq(adapter->native->err_virq, adapter);
+	cxl_ops->release_one_irq(adapter, adapter->native->err_hwirq);
+	kfree(adapter->irq_name);
+}
+
+int cxl_native_register_serr_irq(struct cxl_afu *afu)
+{
+	u64 serr;
+	int rc;
+
+	afu->err_irq_name = kasprintf(GFP_KERNEL, "cxl-%s-err",
+				      dev_name(&afu->dev));
+	if (!afu->err_irq_name)
+		return -ENOMEM;
+
+	if ((rc = cxl_register_one_irq(afu->adapter, native_slice_irq_err, afu,
+				       &afu->serr_hwirq,
+				       &afu->serr_virq, afu->err_irq_name))) {
+		kfree(afu->err_irq_name);
+		afu->err_irq_name = NULL;
+		return rc;
+	}
+
+	serr = cxl_p1n_read(afu, CXL_PSL_SERR_An);
+	serr = (serr & 0x00ffffffffff0000ULL) | (afu->serr_hwirq & 0xffff);
+	cxl_p1n_write(afu, CXL_PSL_SERR_An, serr);
 
 	return 0;
 }
 
+void cxl_native_release_serr_irq(struct cxl_afu *afu)
+{
+	if (afu->serr_virq != irq_find_mapping(NULL, afu->serr_hwirq))
+		return;
+
+	cxl_p1n_write(afu, CXL_PSL_SERR_An, 0x0000000000000000);
+	cxl_unmap_irq(afu->serr_virq, afu);
+	cxl_ops->release_one_irq(afu->adapter, afu->serr_hwirq);
+	kfree(afu->err_irq_name);
+}
+
+int cxl_native_register_psl_irq(struct cxl_afu *afu)
+{
+	int rc;
+
+	afu->psl_irq_name = kasprintf(GFP_KERNEL, "cxl-%s",
+				      dev_name(&afu->dev));
+	if (!afu->psl_irq_name)
+		return -ENOMEM;
+
+	if ((rc = cxl_register_one_irq(afu->adapter, native_irq_multiplexed,
+				    afu, &afu->native->psl_hwirq, &afu->native->psl_virq,
+				    afu->psl_irq_name))) {
+		kfree(afu->psl_irq_name);
+		afu->psl_irq_name = NULL;
+	}
+	return rc;
+}
+
+void cxl_native_release_psl_irq(struct cxl_afu *afu)
+{
+	if (afu->native->psl_virq != irq_find_mapping(NULL, afu->native->psl_hwirq))
+		return;
+
+	cxl_unmap_irq(afu->native->psl_virq, afu);
+	cxl_ops->release_one_irq(afu->adapter, afu->native->psl_hwirq);
+	kfree(afu->psl_irq_name);
+}
+
 static void recover_psl_err(struct cxl_afu *afu, u64 errstat)
 {
 	u64 dsisr;
@@ -769,7 +951,7 @@ static void recover_psl_err(struct cxl_afu *afu, u64 errstat)
 	cxl_p2n_write(afu, CXL_PSL_ErrStat_An, errstat);
 }
 
-int cxl_ack_irq(struct cxl_context *ctx, u64 tfc, u64 psl_reset_mask)
+static int native_ack_irq(struct cxl_context *ctx, u64 tfc, u64 psl_reset_mask)
 {
 	trace_cxl_psl_irq_ack(ctx, tfc);
 	if (tfc)
@@ -784,3 +966,132 @@ int cxl_check_error(struct cxl_afu *afu)
 {
 	return (cxl_p1n_read(afu, CXL_PSL_SCNTL_An) == ~0ULL);
 }
+
+static bool native_support_attributes(const char *attr_name,
+				      enum cxl_attrs type)
+{
+	return true;
+}
+
+static int native_afu_cr_read64(struct cxl_afu *afu, int cr, u64 off, u64 *out)
+{
+	if (unlikely(!cxl_ops->link_ok(afu->adapter, afu)))
+		return -EIO;
+	if (unlikely(off >= afu->crs_len))
+		return -ERANGE;
+	*out = in_le64(afu->native->afu_desc_mmio + afu->crs_offset +
+		(cr * afu->crs_len) + off);
+	return 0;
+}
+
+static int native_afu_cr_read32(struct cxl_afu *afu, int cr, u64 off, u32 *out)
+{
+	if (unlikely(!cxl_ops->link_ok(afu->adapter, afu)))
+		return -EIO;
+	if (unlikely(off >= afu->crs_len))
+		return -ERANGE;
+	*out = in_le32(afu->native->afu_desc_mmio + afu->crs_offset +
+		(cr * afu->crs_len) + off);
+	return 0;
+}
+
+static int native_afu_cr_read16(struct cxl_afu *afu, int cr, u64 off, u16 *out)
+{
+	u64 aligned_off = off & ~0x3L;
+	u32 val;
+	int rc;
+
+	rc = native_afu_cr_read32(afu, cr, aligned_off, &val);
+	if (!rc)
+		*out = (val >> ((off & 0x3) * 8)) & 0xffff;
+	return rc;
+}
+
+static int native_afu_cr_read8(struct cxl_afu *afu, int cr, u64 off, u8 *out)
+{
+	u64 aligned_off = off & ~0x3L;
+	u32 val;
+	int rc;
+
+	rc = native_afu_cr_read32(afu, cr, aligned_off, &val);
+	if (!rc)
+		*out = (val >> ((off & 0x3) * 8)) & 0xff;
+	return rc;
+}
+
+static int native_afu_cr_write32(struct cxl_afu *afu, int cr, u64 off, u32 in)
+{
+	if (unlikely(!cxl_ops->link_ok(afu->adapter, afu)))
+		return -EIO;
+	if (unlikely(off >= afu->crs_len))
+		return -ERANGE;
+	out_le32(afu->native->afu_desc_mmio + afu->crs_offset +
+		(cr * afu->crs_len) + off, in);
+	return 0;
+}
+
+static int native_afu_cr_write16(struct cxl_afu *afu, int cr, u64 off, u16 in)
+{
+	u64 aligned_off = off & ~0x3L;
+	u32 val32, mask, shift;
+	int rc;
+
+	rc = native_afu_cr_read32(afu, cr, aligned_off, &val32);
+	if (rc)
+		return rc;
+	shift = (off & 0x3) * 8;
+	WARN_ON(shift == 24);
+	mask = 0xffff << shift;
+	val32 = (val32 & ~mask) | (in << shift);
+
+	rc = native_afu_cr_write32(afu, cr, aligned_off, val32);
+	return rc;
+}
+
+static int native_afu_cr_write8(struct cxl_afu *afu, int cr, u64 off, u8 in)
+{
+	u64 aligned_off = off & ~0x3L;
+	u32 val32, mask, shift;
+	int rc;
+
+	rc = native_afu_cr_read32(afu, cr, aligned_off, &val32);
+	if (rc)
+		return rc;
+	shift = (off & 0x3) * 8;
+	mask = 0xff << shift;
+	val32 = (val32 & ~mask) | (in << shift);
+
+	rc = native_afu_cr_write32(afu, cr, aligned_off, val32);
+	return rc;
+}
+
+const struct cxl_backend_ops cxl_native_ops = {
+	.module = THIS_MODULE,
+	.adapter_reset = cxl_pci_reset,
+	.alloc_one_irq = cxl_pci_alloc_one_irq,
+	.release_one_irq = cxl_pci_release_one_irq,
+	.alloc_irq_ranges = cxl_pci_alloc_irq_ranges,
+	.release_irq_ranges = cxl_pci_release_irq_ranges,
+	.setup_irq = cxl_pci_setup_irq,
+	.handle_psl_slice_error = native_handle_psl_slice_error,
+	.psl_interrupt = NULL,
+	.ack_irq = native_ack_irq,
+	.attach_process = native_attach_process,
+	.detach_process = native_detach_process,
+	.support_attributes = native_support_attributes,
+	.link_ok = cxl_adapter_link_ok,
+	.release_afu = cxl_pci_release_afu,
+	.afu_read_err_buffer = cxl_pci_afu_read_err_buffer,
+	.afu_check_and_enable = native_afu_check_and_enable,
+	.afu_activate_mode = native_afu_activate_mode,
+	.afu_deactivate_mode = native_afu_deactivate_mode,
+	.afu_reset = native_afu_reset,
+	.afu_cr_read8 = native_afu_cr_read8,
+	.afu_cr_read16 = native_afu_cr_read16,
+	.afu_cr_read32 = native_afu_cr_read32,
+	.afu_cr_read64 = native_afu_cr_read64,
+	.afu_cr_write8 = native_afu_cr_write8,
+	.afu_cr_write16 = native_afu_cr_write16,
+	.afu_cr_write32 = native_afu_cr_write32,
+	.read_adapter_vpd = cxl_pci_read_adapter_vpd,
+};
diff --git a/drivers/misc/cxl/of.c b/drivers/misc/cxl/of.c
new file mode 100644
index 000000000000..edc458395f68
--- /dev/null
+++ b/drivers/misc/cxl/of.c
@@ -0,0 +1,513 @@
+/*
+ * Copyright 2015 IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <linux/of_address.h>
+#include <linux/of_platform.h>
+
+#include "cxl.h"
+
+
+static const __be32 *read_prop_string(const struct device_node *np,
+				const char *prop_name)
+{
+	const __be32 *prop;
+
+	prop = of_get_property(np, prop_name, NULL);
+	if (cxl_verbose && prop)
+		pr_info("%s: %s\n", prop_name, (char *) prop);
+	return prop;
+}
+
+static const __be32 *read_prop_dword(const struct device_node *np,
+				const char *prop_name, u32 *val)
+{
+	const __be32 *prop;
+
+	prop = of_get_property(np, prop_name, NULL);
+	if (prop)
+		*val = be32_to_cpu(prop[0]);
+	if (cxl_verbose && prop)
+		pr_info("%s: %#x (%u)\n", prop_name, *val, *val);
+	return prop;
+}
+
+static const __be64 *read_prop64_dword(const struct device_node *np,
+				const char *prop_name, u64 *val)
+{
+	const __be64 *prop;
+
+	prop = of_get_property(np, prop_name, NULL);
+	if (prop)
+		*val = be64_to_cpu(prop[0]);
+	if (cxl_verbose && prop)
+		pr_info("%s: %#llx (%llu)\n", prop_name, *val, *val);
+	return prop;
+}
+
+
+static int read_handle(struct device_node *np, u64 *handle)
+{
+	const __be32 *prop;
+	u64 size;
+
+	/* Get address and size of the node */
+	prop = of_get_address(np, 0, &size, NULL);
+	if (size)
+		return -EINVAL;
+
+	/* Helper to read a big number; size is in cells (not bytes) */
+	*handle = of_read_number(prop, of_n_addr_cells(np));
+	return 0;
+}
+
+static int read_phys_addr(struct device_node *np, char *prop_name,
+			struct cxl_afu *afu)
+{
+	int i, len, entry_size, naddr, nsize, type;
+	u64 addr, size;
+	const __be32 *prop;
+
+	naddr = of_n_addr_cells(np);
+	nsize = of_n_size_cells(np);
+
+	prop = of_get_property(np, prop_name, &len);
+	if (prop) {
+		entry_size = naddr + nsize;
+		for (i = 0; i < (len / 4); i += entry_size, prop += entry_size) {
+			type = be32_to_cpu(prop[0]);
+			addr = of_read_number(prop, naddr);
+			size = of_read_number(&prop[naddr], nsize);
+			switch (type) {
+			case 0: /* unit address */
+				afu->guest->handle = addr;
+				break;
+			case 1: /* p2 area */
+				afu->guest->p2n_phys += addr;
+				afu->guest->p2n_size = size;
+				break;
+			case 2: /* problem state area */
+				afu->psn_phys += addr;
+				afu->adapter->ps_size = size;
+				break;
+			default:
+				pr_err("Invalid address type %d found in %s property of AFU\n",
+					type, prop_name);
+				return -EINVAL;
+			}
+			if (cxl_verbose)
+				pr_info("%s: %#x %#llx (size %#llx)\n",
+					prop_name, type, addr, size);
+		}
+	}
+	return 0;
+}
+
+static int read_vpd(struct cxl *adapter, struct cxl_afu *afu)
+{
+	char vpd[256];
+	int rc;
+	size_t len = sizeof(vpd);
+
+	memset(vpd, 0, len);
+
+	if (adapter)
+		rc = cxl_guest_read_adapter_vpd(adapter, vpd, len);
+	else
+		rc = cxl_guest_read_afu_vpd(afu, vpd, len);
+
+	if (rc > 0) {
+		cxl_dump_debug_buffer(vpd, rc);
+		rc = 0;
+	}
+	return rc;
+}
+
+int cxl_of_read_afu_handle(struct cxl_afu *afu, struct device_node *afu_np)
+{
+	if (read_handle(afu_np, &afu->guest->handle))
+		return -EINVAL;
+	pr_devel("AFU handle: 0x%.16llx\n", afu->guest->handle);
+
+	return 0;
+}
+
+int cxl_of_read_afu_properties(struct cxl_afu *afu, struct device_node *np)
+{
+	int i, len, rc;
+	char *p;
+	const __be32 *prop;
+	u16 device_id, vendor_id;
+	u32 val = 0, class_code;
+
+	/* Properties are read in the same order as listed in PAPR */
+
+	if (cxl_verbose) {
+		pr_info("Dump of the 'ibm,coherent-platform-function' node properties:\n");
+
+		prop = of_get_property(np, "compatible", &len);
+		i = 0;
+		while (i < len) {
+			p = (char *) prop + i;
+			pr_info("compatible: %s\n", p);
+			i += strlen(p) + 1;
+		}
+		read_prop_string(np, "name");
+	}
+
+	rc = read_phys_addr(np, "reg", afu);
+	if (rc)
+		return rc;
+
+	rc = read_phys_addr(np, "assigned-addresses", afu);
+	if (rc)
+		return rc;
+
+	if (afu->psn_phys == 0)
+		afu->psa = false;
+	else
+		afu->psa = true;
+
+	if (cxl_verbose) {
+		read_prop_string(np, "ibm,loc-code");
+		read_prop_string(np, "device_type");
+	}
+
+	read_prop_dword(np, "ibm,#processes", &afu->max_procs_virtualised);
+
+	if (cxl_verbose) {
+		read_prop_dword(np, "ibm,scratchpad-size", &val);
+		read_prop_dword(np, "ibm,programmable", &val);
+		read_prop_string(np, "ibm,phandle");
+		read_vpd(NULL, afu);
+	}
+
+	read_prop_dword(np, "ibm,max-ints-per-process", &afu->guest->max_ints);
+	afu->irqs_max = afu->guest->max_ints;
+
+	prop = read_prop_dword(np, "ibm,min-ints-per-process", &afu->pp_irqs);
+	if (prop) {
+		/* One extra interrupt for the PSL interrupt is already
+		 * included. Remove it now to keep only AFU interrupts and
+		 * match the native case.
+		 */
+		afu->pp_irqs--;
+	}
+
+	if (cxl_verbose) {
+		read_prop_dword(np, "ibm,max-ints", &val);
+		read_prop_dword(np, "ibm,vpd-size", &val);
+	}
+
+	read_prop64_dword(np, "ibm,error-buffer-size", &afu->eb_len);
+	afu->eb_offset = 0;
+
+	if (cxl_verbose)
+		read_prop_dword(np, "ibm,config-record-type", &val);
+
+	read_prop64_dword(np, "ibm,config-record-size", &afu->crs_len);
+	afu->crs_offset = 0;
+
+	read_prop_dword(np, "ibm,#config-records", &afu->crs_num);
+
+	if (cxl_verbose) {
+		for (i = 0; i < afu->crs_num; i++) {
+			rc = cxl_ops->afu_cr_read16(afu, i, PCI_DEVICE_ID,
+						&device_id);
+			if (!rc)
+				pr_info("record %d - device-id: %#x\n",
+					i, device_id);
+			rc = cxl_ops->afu_cr_read16(afu, i, PCI_VENDOR_ID,
+						&vendor_id);
+			if (!rc)
+				pr_info("record %d - vendor-id: %#x\n",
+					i, vendor_id);
+			rc = cxl_ops->afu_cr_read32(afu, i, PCI_CLASS_REVISION,
+						&class_code);
+			if (!rc) {
+				class_code >>= 8;
+				pr_info("record %d - class-code: %#x\n",
+					i, class_code);
+			}
+		}
+
+		read_prop_dword(np, "ibm,function-number", &val);
+		read_prop_dword(np, "ibm,privileged-function", &val);
+		read_prop_dword(np, "vendor-id", &val);
+		read_prop_dword(np, "device-id", &val);
+		read_prop_dword(np, "revision-id", &val);
+		read_prop_dword(np, "class-code", &val);
+		read_prop_dword(np, "subsystem-vendor-id", &val);
+		read_prop_dword(np, "subsystem-id", &val);
+	}
+	/*
+	 * if "ibm,process-mmio" doesn't exist then per-process mmio is
+	 * not supported
+	 */
+	val = 0;
+	prop = read_prop_dword(np, "ibm,process-mmio", &val);
+	if (prop && val == 1)
+		afu->pp_psa = true;
+	else
+		afu->pp_psa = false;
+
+	if (cxl_verbose) {
+		read_prop_dword(np, "ibm,supports-aur", &val);
+		read_prop_dword(np, "ibm,supports-csrp", &val);
+		read_prop_dword(np, "ibm,supports-prr", &val);
+	}
+
+	prop = read_prop_dword(np, "ibm,function-error-interrupt", &val);
+	if (prop)
+		afu->serr_hwirq = val;
+
+	pr_devel("AFU handle: %#llx\n", afu->guest->handle);
+	pr_devel("p2n_phys: %#llx (size %#llx)\n",
+		afu->guest->p2n_phys, afu->guest->p2n_size);
+	pr_devel("psn_phys: %#llx (size %#llx)\n",
+		afu->psn_phys, afu->adapter->ps_size);
+	pr_devel("Max number of processes virtualised=%i\n",
+		afu->max_procs_virtualised);
+	pr_devel("Per-process irqs min=%i, max=%i\n", afu->pp_irqs,
+		 afu->irqs_max);
+	pr_devel("Slice error interrupt=%#lx\n", afu->serr_hwirq);
+
+	return 0;
+}
+
+static int read_adapter_irq_config(struct cxl *adapter, struct device_node *np)
+{
+	const __be32 *ranges;
+	int len, nranges, i;
+	struct irq_avail *cur;
+
+	ranges = of_get_property(np, "interrupt-ranges", &len);
+	if (ranges == NULL || len < (2 * sizeof(int)))
+		return -EINVAL;
+
+	/*
+	 * encoded array of two cells per entry, each cell encoded as
+	 * with encode-int
+	 */
+	nranges = len / (2 * sizeof(int));
+	if (nranges == 0 || (nranges * 2 * sizeof(int)) != len)
+		return -EINVAL;
+
+	adapter->guest->irq_avail = kzalloc(nranges * sizeof(struct irq_avail),
+					    GFP_KERNEL);
+	if (adapter->guest->irq_avail == NULL)
+		return -ENOMEM;
+
+	adapter->guest->irq_base_offset = be32_to_cpu(ranges[0]);
+	for (i = 0; i < nranges; i++) {
+		cur = &adapter->guest->irq_avail[i];
+		cur->offset = be32_to_cpu(ranges[i * 2]);
+		cur->range  = be32_to_cpu(ranges[i * 2 + 1]);
+		cur->bitmap = kcalloc(BITS_TO_LONGS(cur->range),
+				sizeof(*cur->bitmap), GFP_KERNEL);
+		if (cur->bitmap == NULL)
+			goto err;
+		if (cur->offset < adapter->guest->irq_base_offset)
+			adapter->guest->irq_base_offset = cur->offset;
+		if (cxl_verbose)
+			pr_info("available IRQ range: %#lx-%#lx (%lu)\n",
+				cur->offset, cur->offset + cur->range - 1,
+				cur->range);
+	}
+	adapter->guest->irq_nranges = nranges;
+	spin_lock_init(&adapter->guest->irq_alloc_lock);
+
+	return 0;
+err:
+	for (i--; i >= 0; i--) {
+		cur = &adapter->guest->irq_avail[i];
+		kfree(cur->bitmap);
+	}
+	kfree(adapter->guest->irq_avail);
+	adapter->guest->irq_avail = NULL;
+	return -ENOMEM;
+}
+
+int cxl_of_read_adapter_handle(struct cxl *adapter, struct device_node *np)
+{
+	if (read_handle(np, &adapter->guest->handle))
+		return -EINVAL;
+	pr_devel("Adapter handle: 0x%.16llx\n", adapter->guest->handle);
+
+	return 0;
+}
+
+int cxl_of_read_adapter_properties(struct cxl *adapter, struct device_node *np)
+{
+	int rc, len, naddr, i;
+	char *p;
+	const __be32 *prop;
+	u32 val = 0;
+
+	/* Properties are read in the same order as listed in PAPR */
+
+	naddr = of_n_addr_cells(np);
+
+	if (cxl_verbose) {
+		pr_info("Dump of the 'ibm,coherent-platform-facility' node properties:\n");
+
+		read_prop_dword(np, "#address-cells", &val);
+		read_prop_dword(np, "#size-cells", &val);
+
+		prop = of_get_property(np, "compatible", &len);
+		i = 0;
+		while (i < len) {
+			p = (char *) prop + i;
+			pr_info("compatible: %s\n", p);
+			i += strlen(p) + 1;
+		}
+		read_prop_string(np, "name");
+		read_prop_string(np, "model");
+
+		prop = of_get_property(np, "reg", NULL);
+		if (prop) {
+			pr_info("reg: addr:%#llx size:%#x\n",
+				of_read_number(prop, naddr),
+				be32_to_cpu(prop[naddr]));
+		}
+
+		read_prop_string(np, "ibm,loc-code");
+	}
+
+	if ((rc = read_adapter_irq_config(adapter, np)))
+		return rc;
+
+	if (cxl_verbose) {
+		read_prop_string(np, "device_type");
+		read_prop_string(np, "ibm,phandle");
+	}
+
+	prop = read_prop_dword(np, "ibm,caia-version", &val);
+	if (prop) {
+		adapter->caia_major = (val & 0xFF00) >> 8;
+		adapter->caia_minor = val & 0xFF;
+	}
+
+	prop = read_prop_dword(np, "ibm,psl-revision", &val);
+	if (prop)
+		adapter->psl_rev = val;
+
+	prop = read_prop_string(np, "status");
+	if (prop) {
+		adapter->guest->status = kasprintf(GFP_KERNEL, "%s", (char *) prop);
+		if (adapter->guest->status == NULL)
+			return -ENOMEM;
+	}
+
+	prop = read_prop_dword(np, "vendor-id", &val);
+	if (prop)
+		adapter->guest->vendor = val;
+
+	prop = read_prop_dword(np, "device-id", &val);
+	if (prop)
+		adapter->guest->device = val;
+
+	if (cxl_verbose) {
+		read_prop_dword(np, "ibm,privileged-facility", &val);
+		read_prop_dword(np, "revision-id", &val);
+		read_prop_dword(np, "class-code", &val);
+	}
+
+	prop = read_prop_dword(np, "subsystem-vendor-id", &val);
+	if (prop)
+		adapter->guest->subsystem_vendor = val;
+
+	prop = read_prop_dword(np, "subsystem-id", &val);
+	if (prop)
+		adapter->guest->subsystem = val;
+
+	if (cxl_verbose)
+		read_vpd(adapter, NULL);
+
+	return 0;
+}
+
+static int cxl_of_remove(struct platform_device *pdev)
+{
+	struct cxl *adapter;
+	int afu;
+
+	adapter = dev_get_drvdata(&pdev->dev);
+	for (afu = 0; afu < adapter->slices; afu++)
+		cxl_guest_remove_afu(adapter->afu[afu]);
+
+	cxl_guest_remove_adapter(adapter);
+	return 0;
+}
+
+static void cxl_of_shutdown(struct platform_device *pdev)
+{
+	cxl_of_remove(pdev);
+}
+
+int cxl_of_probe(struct platform_device *pdev)
+{
+	struct device_node *np = NULL;
+	struct device_node *afu_np = NULL;
+	struct cxl *adapter = NULL;
+	int ret;
+	int slice, slice_ok;
+
+	pr_devel("in %s\n", __func__);
+
+	np = pdev->dev.of_node;
+	if (np == NULL)
+		return -ENODEV;
+
+	/* init adapter */
+	adapter = cxl_guest_init_adapter(np, pdev);
+	if (IS_ERR(adapter)) {
+		dev_err(&pdev->dev, "guest_init_adapter failed: %li\n", PTR_ERR(adapter));
+		return PTR_ERR(adapter);
+	}
+
+	/* init afu */
+	slice_ok = 0;
+	for (afu_np = NULL, slice = 0; (afu_np = of_get_next_child(np, afu_np)); slice++) {
+		if ((ret = cxl_guest_init_afu(adapter, slice, afu_np)))
+			dev_err(&pdev->dev, "AFU %i failed to initialise: %i\n",
+				slice, ret);
+		else
+			slice_ok++;
+	}
+
+	if (slice_ok == 0) {
+		dev_info(&pdev->dev, "No active AFU");
+		adapter->slices = 0;
+	}
+
+	if (afu_np)
+		of_node_put(afu_np);
+	return 0;
+}
+
+static const struct of_device_id cxl_of_match[] = {
+	{ .compatible = "ibm,coherent-platform-facility",},
+	{},
+};
+MODULE_DEVICE_TABLE(of, cxl_of_match);
+
+struct platform_driver cxl_of_driver = {
+	.driver = {
+		.name = "cxl_of",
+		.of_match_table = cxl_of_match,
+		.owner = THIS_MODULE
+	},
+	.probe = cxl_of_probe,
+	.remove = cxl_of_remove,
+	.shutdown = cxl_of_shutdown,
+};
diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c
index 4c1903f781fc..6634b7a3c42b 100644
--- a/drivers/misc/cxl/pci.c
+++ b/drivers/misc/cxl/pci.c
@@ -90,8 +90,8 @@
 
 /* This works a little different than the p1/p2 register accesses to make it
  * easier to pull out individual fields */
-#define AFUD_READ(afu, off)		in_be64(afu->afu_desc_mmio + off)
-#define AFUD_READ_LE(afu, off)		in_le64(afu->afu_desc_mmio + off)
+#define AFUD_READ(afu, off)		in_be64(afu->native->afu_desc_mmio + off)
+#define AFUD_READ_LE(afu, off)		in_le64(afu->native->afu_desc_mmio + off)
 #define EXTRACT_PPC_BIT(val, bit)	(!!(val & PPC_BIT(bit)))
 #define EXTRACT_PPC_BITS(val, bs, be)	((val & PPC_BITMASK(bs, be)) >> PPC_BITLSHIFT(be))
 
@@ -116,24 +116,6 @@
 #define   AFUD_EB_LEN(val)		EXTRACT_PPC_BITS(val, 8, 63)
 #define AFUD_READ_EB_OFF(afu)		AFUD_READ(afu, 0x48)
 
-u16 cxl_afu_cr_read16(struct cxl_afu *afu, int cr, u64 off)
-{
-	u64 aligned_off = off & ~0x3L;
-	u32 val;
-
-	val = cxl_afu_cr_read32(afu, cr, aligned_off);
-	return (val >> ((off & 0x2) * 8)) & 0xffff;
-}
-
-u8 cxl_afu_cr_read8(struct cxl_afu *afu, int cr, u64 off)
-{
-	u64 aligned_off = off & ~0x3L;
-	u32 val;
-
-	val = cxl_afu_cr_read32(afu, cr, aligned_off);
-	return (val >> ((off & 0x3) * 8)) & 0xff;
-}
-
 static const struct pci_device_id cxl_pci_tbl[] = {
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x0477), },
 	{ PCI_DEVICE(PCI_VENDOR_ID_IBM, 0x044b), },
@@ -433,8 +415,8 @@ static int init_implementation_afu_regs(struct cxl_afu *afu)
 	return 0;
 }
 
-int cxl_setup_irq(struct cxl *adapter, unsigned int hwirq,
-			 unsigned int virq)
+int cxl_pci_setup_irq(struct cxl *adapter, unsigned int hwirq,
+		unsigned int virq)
 {
 	struct pci_dev *dev = to_pci_dev(adapter->dev.parent);
 
@@ -476,28 +458,30 @@ int cxl_update_image_control(struct cxl *adapter)
 	return 0;
 }
 
-int cxl_alloc_one_irq(struct cxl *adapter)
+int cxl_pci_alloc_one_irq(struct cxl *adapter)
 {
 	struct pci_dev *dev = to_pci_dev(adapter->dev.parent);
 
 	return pnv_cxl_alloc_hwirqs(dev, 1);
 }
 
-void cxl_release_one_irq(struct cxl *adapter, int hwirq)
+void cxl_pci_release_one_irq(struct cxl *adapter, int hwirq)
 {
 	struct pci_dev *dev = to_pci_dev(adapter->dev.parent);
 
 	return pnv_cxl_release_hwirqs(dev, hwirq, 1);
 }
 
-int cxl_alloc_irq_ranges(struct cxl_irq_ranges *irqs, struct cxl *adapter, unsigned int num)
+int cxl_pci_alloc_irq_ranges(struct cxl_irq_ranges *irqs,
+			struct cxl *adapter, unsigned int num)
 {
 	struct pci_dev *dev = to_pci_dev(adapter->dev.parent);
 
 	return pnv_cxl_alloc_hwirq_ranges(irqs, dev, num);
 }
 
-void cxl_release_irq_ranges(struct cxl_irq_ranges *irqs, struct cxl *adapter)
+void cxl_pci_release_irq_ranges(struct cxl_irq_ranges *irqs,
+				struct cxl *adapter)
 {
 	struct pci_dev *dev = to_pci_dev(adapter->dev.parent);
 
@@ -558,7 +542,7 @@ static int switch_card_to_cxl(struct pci_dev *dev)
 	return 0;
 }
 
-static int cxl_map_slice_regs(struct cxl_afu *afu, struct cxl *adapter, struct pci_dev *dev)
+static int pci_map_slice_regs(struct cxl_afu *afu, struct cxl *adapter, struct pci_dev *dev)
 {
 	u64 p1n_base, p2n_base, afu_desc;
 	const u64 p1n_size = 0x100;
@@ -566,15 +550,15 @@ static int cxl_map_slice_regs(struct cxl_afu *afu, struct cxl *adapter, struct p
 
 	p1n_base = p1_base(dev) + 0x10000 + (afu->slice * p1n_size);
 	p2n_base = p2_base(dev) + (afu->slice * p2n_size);
-	afu->psn_phys = p2_base(dev) + (adapter->ps_off + (afu->slice * adapter->ps_size));
-	afu_desc = p2_base(dev) + adapter->afu_desc_off + (afu->slice * adapter->afu_desc_size);
+	afu->psn_phys = p2_base(dev) + (adapter->native->ps_off + (afu->slice * adapter->ps_size));
+	afu_desc = p2_base(dev) + adapter->native->afu_desc_off + (afu->slice * adapter->native->afu_desc_size);
 
-	if (!(afu->p1n_mmio = ioremap(p1n_base, p1n_size)))
+	if (!(afu->native->p1n_mmio = ioremap(p1n_base, p1n_size)))
 		goto err;
 	if (!(afu->p2n_mmio = ioremap(p2n_base, p2n_size)))
 		goto err1;
 	if (afu_desc) {
-		if (!(afu->afu_desc_mmio = ioremap(afu_desc, adapter->afu_desc_size)))
+		if (!(afu->native->afu_desc_mmio = ioremap(afu_desc, adapter->native->afu_desc_size)))
 			goto err2;
 	}
 
@@ -582,62 +566,41 @@ static int cxl_map_slice_regs(struct cxl_afu *afu, struct cxl *adapter, struct p
 err2:
 	iounmap(afu->p2n_mmio);
 err1:
-	iounmap(afu->p1n_mmio);
+	iounmap(afu->native->p1n_mmio);
 err:
 	dev_err(&afu->dev, "Error mapping AFU MMIO regions\n");
 	return -ENOMEM;
 }
 
-static void cxl_unmap_slice_regs(struct cxl_afu *afu)
+static void pci_unmap_slice_regs(struct cxl_afu *afu)
 {
 	if (afu->p2n_mmio) {
 		iounmap(afu->p2n_mmio);
 		afu->p2n_mmio = NULL;
 	}
-	if (afu->p1n_mmio) {
-		iounmap(afu->p1n_mmio);
-		afu->p1n_mmio = NULL;
+	if (afu->native->p1n_mmio) {
+		iounmap(afu->native->p1n_mmio);
+		afu->native->p1n_mmio = NULL;
 	}
-	if (afu->afu_desc_mmio) {
-		iounmap(afu->afu_desc_mmio);
-		afu->afu_desc_mmio = NULL;
+	if (afu->native->afu_desc_mmio) {
+		iounmap(afu->native->afu_desc_mmio);
+		afu->native->afu_desc_mmio = NULL;
 	}
 }
 
-static void cxl_release_afu(struct device *dev)
+void cxl_pci_release_afu(struct device *dev)
 {
 	struct cxl_afu *afu = to_cxl_afu(dev);
 
-	pr_devel("cxl_release_afu\n");
+	pr_devel("%s\n", __func__);
 
 	idr_destroy(&afu->contexts_idr);
 	cxl_release_spa(afu);
 
+	kfree(afu->native);
 	kfree(afu);
 }
 
-static struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice)
-{
-	struct cxl_afu *afu;
-
-	if (!(afu = kzalloc(sizeof(struct cxl_afu), GFP_KERNEL)))
-		return NULL;
-
-	afu->adapter = adapter;
-	afu->dev.parent = &adapter->dev;
-	afu->dev.release = cxl_release_afu;
-	afu->slice = slice;
-	idr_init(&afu->contexts_idr);
-	mutex_init(&afu->contexts_lock);
-	spin_lock_init(&afu->afu_cntl_lock);
-	mutex_init(&afu->spa_mutex);
-
-	afu->prefault_mode = CXL_PREFAULT_NONE;
-	afu->irqs_max = afu->adapter->user_irqs;
-
-	return afu;
-}
-
 /* Expects AFU struct to have recently been zeroed out */
 static int cxl_read_afu_descriptor(struct cxl_afu *afu)
 {
@@ -659,7 +622,7 @@ static int cxl_read_afu_descriptor(struct cxl_afu *afu)
 	afu->pp_size = AFUD_PPPSA_LEN(val) * 4096;
 	afu->psa = AFUD_PPPSA_PSA(val);
 	if ((afu->pp_psa = AFUD_PPPSA_PP(val)))
-		afu->pp_offset = AFUD_READ_PPPSA_OFF(afu);
+		afu->native->pp_offset = AFUD_READ_PPPSA_OFF(afu);
 
 	val = AFUD_READ_CR(afu);
 	afu->crs_len = AFUD_CR_LEN(val) * 256;
@@ -686,10 +649,11 @@ static int cxl_read_afu_descriptor(struct cxl_afu *afu)
 
 static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
 {
-	int i;
+	int i, rc;
+	u32 val;
 
 	if (afu->psa && afu->adapter->ps_size <
-			(afu->pp_offset + afu->pp_size*afu->max_procs_virtualised)) {
+			(afu->native->pp_offset + afu->pp_size*afu->max_procs_virtualised)) {
 		dev_err(&afu->dev, "per-process PSA can't fit inside the PSA!\n");
 		return -ENODEV;
 	}
@@ -698,7 +662,8 @@ static int cxl_afu_descriptor_looks_ok(struct cxl_afu *afu)
 		dev_warn(&afu->dev, "AFU uses < PAGE_SIZE per-process PSA!");
 
 	for (i = 0; i < afu->crs_num; i++) {
-		if ((cxl_afu_cr_read32(afu, i, 0) == 0)) {
+		rc = cxl_ops->afu_cr_read32(afu, i, 0, &val);
+		if (rc || val == 0) {
 			dev_err(&afu->dev, "ABORTING: AFU configuration record %i is invalid\n", i);
 			return -EINVAL;
 		}
@@ -719,7 +684,7 @@ static int sanitise_afu_regs(struct cxl_afu *afu)
 	reg = cxl_p2n_read(afu, CXL_AFU_Cntl_An);
 	if ((reg & CXL_AFU_Cntl_An_ES_MASK) != CXL_AFU_Cntl_An_ES_Disabled) {
 		dev_warn(&afu->dev, "WARNING: AFU was not disabled: %#016llx\n", reg);
-		if (__cxl_afu_reset(afu))
+		if (cxl_ops->afu_reset(afu))
 			return -EIO;
 		if (cxl_afu_disable(afu))
 			return -EIO;
@@ -767,13 +732,13 @@ static int sanitise_afu_regs(struct cxl_afu *afu)
  * 4/8 bytes aligned access. So in case the requested offset/count arent 8 byte
  * aligned the function uses a bounce buffer which can be max PAGE_SIZE.
  */
-ssize_t cxl_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
+ssize_t cxl_pci_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
 				loff_t off, size_t count)
 {
 	loff_t aligned_start, aligned_end;
 	size_t aligned_length;
 	void *tbuf;
-	const void __iomem *ebuf = afu->afu_desc_mmio + afu->eb_offset;
+	const void __iomem *ebuf = afu->native->afu_desc_mmio + afu->eb_offset;
 
 	if (count == 0 || off < 0 || (size_t)off >= afu->eb_len)
 		return 0;
@@ -804,18 +769,18 @@ ssize_t cxl_afu_read_err_buffer(struct cxl_afu *afu, char *buf,
 	return count;
 }
 
-static int cxl_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pci_dev *dev)
+static int pci_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pci_dev *dev)
 {
 	int rc;
 
-	if ((rc = cxl_map_slice_regs(afu, adapter, dev)))
+	if ((rc = pci_map_slice_regs(afu, adapter, dev)))
 		return rc;
 
 	if ((rc = sanitise_afu_regs(afu)))
 		goto err1;
 
 	/* We need to reset the AFU before we can read the AFU descriptor */
-	if ((rc = __cxl_afu_reset(afu)))
+	if ((rc = cxl_ops->afu_reset(afu)))
 		goto err1;
 
 	if (cxl_verbose)
@@ -830,44 +795,50 @@ static int cxl_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pc
 	if ((rc = init_implementation_afu_regs(afu)))
 		goto err1;
 
-	if ((rc = cxl_register_serr_irq(afu)))
+	if ((rc = cxl_native_register_serr_irq(afu)))
 		goto err1;
 
-	if ((rc = cxl_register_psl_irq(afu)))
+	if ((rc = cxl_native_register_psl_irq(afu)))
 		goto err2;
 
 	return 0;
 
 err2:
-	cxl_release_serr_irq(afu);
+	cxl_native_release_serr_irq(afu);
 err1:
-	cxl_unmap_slice_regs(afu);
+	pci_unmap_slice_regs(afu);
 	return rc;
 }
 
-static void cxl_deconfigure_afu(struct cxl_afu *afu)
+static void pci_deconfigure_afu(struct cxl_afu *afu)
 {
-	cxl_release_psl_irq(afu);
-	cxl_release_serr_irq(afu);
-	cxl_unmap_slice_regs(afu);
+	cxl_native_release_psl_irq(afu);
+	cxl_native_release_serr_irq(afu);
+	pci_unmap_slice_regs(afu);
 }
 
-static int cxl_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev)
+static int pci_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev)
 {
 	struct cxl_afu *afu;
-	int rc;
+	int rc = -ENOMEM;
 
 	afu = cxl_alloc_afu(adapter, slice);
 	if (!afu)
 		return -ENOMEM;
 
+	afu->native = kzalloc(sizeof(struct cxl_afu_native), GFP_KERNEL);
+	if (!afu->native)
+		goto err_free_afu;
+
+	mutex_init(&afu->native->spa_mutex);
+
 	rc = dev_set_name(&afu->dev, "afu%i.%i", adapter->adapter_num, slice);
 	if (rc)
-		goto err_free;
+		goto err_free_native;
 
-	rc = cxl_configure_afu(afu, adapter, dev);
+	rc = pci_configure_afu(afu, adapter, dev);
 	if (rc)
-		goto err_free;
+		goto err_free_native;
 
 	/* Don't care if this fails */
 	cxl_debugfs_afu_add(afu);
@@ -890,24 +861,27 @@ static int cxl_init_afu(struct cxl *adapter, int slice, struct pci_dev *dev)
 	return 0;
 
 err_put1:
-	cxl_deconfigure_afu(afu);
+	pci_deconfigure_afu(afu);
 	cxl_debugfs_afu_remove(afu);
 	device_unregister(&afu->dev);
 	return rc;
 
-err_free:
+err_free_native:
+	kfree(afu->native);
+err_free_afu:
 	kfree(afu);
 	return rc;
 
 }
 
-static void cxl_remove_afu(struct cxl_afu *afu)
+static void cxl_pci_remove_afu(struct cxl_afu *afu)
 {
-	pr_devel("cxl_remove_afu\n");
+	pr_devel("%s\n", __func__);
 
 	if (!afu)
 		return;
 
+	cxl_pci_vphb_remove(afu);
 	cxl_sysfs_afu_remove(afu);
 	cxl_debugfs_afu_remove(afu);
 
@@ -916,13 +890,13 @@ static void cxl_remove_afu(struct cxl_afu *afu)
 	spin_unlock(&afu->adapter->afu_list_lock);
 
 	cxl_context_detach_all(afu);
-	cxl_afu_deactivate_mode(afu);
+	cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
 
-	cxl_deconfigure_afu(afu);
+	pci_deconfigure_afu(afu);
 	device_unregister(&afu->dev);
 }
 
-int cxl_reset(struct cxl *adapter)
+int cxl_pci_reset(struct cxl *adapter)
 {
 	struct pci_dev *dev = to_pci_dev(adapter->dev.parent);
 	int rc;
@@ -956,17 +930,17 @@ static int cxl_map_adapter_regs(struct cxl *adapter, struct pci_dev *dev)
 	pr_devel("cxl_map_adapter_regs: p1: %#016llx %#llx, p2: %#016llx %#llx",
 			p1_base(dev), p1_size(dev), p2_base(dev), p2_size(dev));
 
-	if (!(adapter->p1_mmio = ioremap(p1_base(dev), p1_size(dev))))
+	if (!(adapter->native->p1_mmio = ioremap(p1_base(dev), p1_size(dev))))
 		goto err3;
 
-	if (!(adapter->p2_mmio = ioremap(p2_base(dev), p2_size(dev))))
+	if (!(adapter->native->p2_mmio = ioremap(p2_base(dev), p2_size(dev))))
 		goto err4;
 
 	return 0;
 
 err4:
-	iounmap(adapter->p1_mmio);
-	adapter->p1_mmio = NULL;
+	iounmap(adapter->native->p1_mmio);
+	adapter->native->p1_mmio = NULL;
 err3:
 	pci_release_region(dev, 0);
 err2:
@@ -977,14 +951,14 @@ err1:
 
 static void cxl_unmap_adapter_regs(struct cxl *adapter)
 {
-	if (adapter->p1_mmio) {
-		iounmap(adapter->p1_mmio);
-		adapter->p1_mmio = NULL;
+	if (adapter->native->p1_mmio) {
+		iounmap(adapter->native->p1_mmio);
+		adapter->native->p1_mmio = NULL;
 		pci_release_region(to_pci_dev(adapter->dev.parent), 2);
 	}
-	if (adapter->p2_mmio) {
-		iounmap(adapter->p2_mmio);
-		adapter->p2_mmio = NULL;
+	if (adapter->native->p2_mmio) {
+		iounmap(adapter->native->p2_mmio);
+		adapter->native->p2_mmio = NULL;
 		pci_release_region(to_pci_dev(adapter->dev.parent), 0);
 	}
 }
@@ -1025,10 +999,10 @@ static int cxl_read_vsec(struct cxl *adapter, struct pci_dev *dev)
 
 	/* Convert everything to bytes, because there is NO WAY I'd look at the
 	 * code a month later and forget what units these are in ;-) */
-	adapter->ps_off = ps_off * 64 * 1024;
+	adapter->native->ps_off = ps_off * 64 * 1024;
 	adapter->ps_size = ps_size * 64 * 1024;
-	adapter->afu_desc_off = afu_desc_off * 64 * 1024;
-	adapter->afu_desc_size = afu_desc_size *64 * 1024;
+	adapter->native->afu_desc_off = afu_desc_off * 64 * 1024;
+	adapter->native->afu_desc_size = afu_desc_size * 64 * 1024;
 
 	/* Total IRQs - 1 PSL ERROR - #AFU*(1 slice error + 1 DSI) */
 	adapter->user_irqs = pnv_cxl_get_irq_count(dev) - 1 - 2*adapter->slices;
@@ -1079,21 +1053,26 @@ static int cxl_vsec_looks_ok(struct cxl *adapter, struct pci_dev *dev)
 		return -EINVAL;
 	}
 
-	if (!adapter->afu_desc_off || !adapter->afu_desc_size) {
+	if (!adapter->native->afu_desc_off || !adapter->native->afu_desc_size) {
 		dev_err(&dev->dev, "ABORTING: VSEC shows no AFU descriptors\n");
 		return -EINVAL;
 	}
 
-	if (adapter->ps_size > p2_size(dev) - adapter->ps_off) {
+	if (adapter->ps_size > p2_size(dev) - adapter->native->ps_off) {
 		dev_err(&dev->dev, "ABORTING: Problem state size larger than "
 				   "available in BAR2: 0x%llx > 0x%llx\n",
-			 adapter->ps_size, p2_size(dev) - adapter->ps_off);
+			 adapter->ps_size, p2_size(dev) - adapter->native->ps_off);
 		return -EINVAL;
 	}
 
 	return 0;
 }
 
+ssize_t cxl_pci_read_adapter_vpd(struct cxl *adapter, void *buf, size_t len)
+{
+	return pci_read_vpd(to_pci_dev(adapter->dev.parent), 0, len, buf);
+}
+
 static void cxl_release_adapter(struct device *dev)
 {
 	struct cxl *adapter = to_cxl_adapter(dev);
@@ -1102,33 +1081,10 @@ static void cxl_release_adapter(struct device *dev)
 
 	cxl_remove_adapter_nr(adapter);
 
+	kfree(adapter->native);
 	kfree(adapter);
 }
 
-static struct cxl *cxl_alloc_adapter(void)
-{
-	struct cxl *adapter;
-
-	if (!(adapter = kzalloc(sizeof(struct cxl), GFP_KERNEL)))
-		return NULL;
-
-	spin_lock_init(&adapter->afu_list_lock);
-
-	if (cxl_alloc_adapter_nr(adapter))
-		goto err1;
-
-	if (dev_set_name(&adapter->dev, "card%i", adapter->adapter_num))
-		goto err2;
-
-	return adapter;
-
-err2:
-	cxl_remove_adapter_nr(adapter);
-err1:
-	kfree(adapter);
-	return NULL;
-}
-
 #define CXL_PSL_ErrIVTE_tberror (0x1ull << (63-31))
 
 static int sanitise_adapter_regs(struct cxl *adapter)
@@ -1192,7 +1148,7 @@ static int cxl_configure_adapter(struct cxl *adapter, struct pci_dev *dev)
 	if ((rc = cxl_setup_psl_timebase(adapter, dev)))
 		goto err;
 
-	if ((rc = cxl_register_psl_err_irq(adapter)))
+	if ((rc = cxl_native_register_psl_err_irq(adapter)))
 		goto err;
 
 	return 0;
@@ -1207,13 +1163,13 @@ static void cxl_deconfigure_adapter(struct cxl *adapter)
 {
 	struct pci_dev *pdev = to_pci_dev(adapter->dev.parent);
 
-	cxl_release_psl_err_irq(adapter);
+	cxl_native_release_psl_err_irq(adapter);
 	cxl_unmap_adapter_regs(adapter);
 
 	pci_disable_device(pdev);
 }
 
-static struct cxl *cxl_init_adapter(struct pci_dev *dev)
+static struct cxl *cxl_pci_init_adapter(struct pci_dev *dev)
 {
 	struct cxl *adapter;
 	int rc;
@@ -1222,6 +1178,12 @@ static struct cxl *cxl_init_adapter(struct pci_dev *dev)
 	if (!adapter)
 		return ERR_PTR(-ENOMEM);
 
+	adapter->native = kzalloc(sizeof(struct cxl_native), GFP_KERNEL);
+	if (!adapter->native) {
+		rc = -ENOMEM;
+		goto err_release;
+	}
+
 	/* Set defaults for parameters which need to persist over
 	 * configure/reconfigure
 	 */
@@ -1231,8 +1193,7 @@ static struct cxl *cxl_init_adapter(struct pci_dev *dev)
 	rc = cxl_configure_adapter(adapter, dev);
 	if (rc) {
 		pci_disable_device(dev);
-		cxl_release_adapter(&adapter->dev);
-		return ERR_PTR(rc);
+		goto err_release;
 	}
 
 	/* Don't care if this one fails: */
@@ -1258,9 +1219,13 @@ err_put1:
 	cxl_deconfigure_adapter(adapter);
 	device_unregister(&adapter->dev);
 	return ERR_PTR(rc);
+
+err_release:
+	cxl_release_adapter(&adapter->dev);
+	return ERR_PTR(rc);
 }
 
-static void cxl_remove_adapter(struct cxl *adapter)
+static void cxl_pci_remove_adapter(struct cxl *adapter)
 {
 	pr_devel("cxl_remove_adapter\n");
 
@@ -1278,17 +1243,22 @@ static int cxl_probe(struct pci_dev *dev, const struct pci_device_id *id)
 	int slice;
 	int rc;
 
+	if (cxl_pci_is_vphb_device(dev)) {
+		dev_dbg(&dev->dev, "cxl_init_adapter: Ignoring cxl vphb device\n");
+		return -ENODEV;
+	}
+
 	if (cxl_verbose)
 		dump_cxl_config_space(dev);
 
-	adapter = cxl_init_adapter(dev);
+	adapter = cxl_pci_init_adapter(dev);
 	if (IS_ERR(adapter)) {
 		dev_err(&dev->dev, "cxl_init_adapter failed: %li\n", PTR_ERR(adapter));
 		return PTR_ERR(adapter);
 	}
 
 	for (slice = 0; slice < adapter->slices; slice++) {
-		if ((rc = cxl_init_afu(adapter, slice, dev))) {
+		if ((rc = pci_init_afu(adapter, slice, dev))) {
 			dev_err(&dev->dev, "AFU %i failed to initialise: %i\n", slice, rc);
 			continue;
 		}
@@ -1313,10 +1283,9 @@ static void cxl_remove(struct pci_dev *dev)
 	 */
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
-		cxl_pci_vphb_remove(afu);
-		cxl_remove_afu(afu);
+		cxl_pci_remove_afu(afu);
 	}
-	cxl_remove_adapter(adapter);
+	cxl_pci_remove_adapter(adapter);
 }
 
 static pci_ers_result_t cxl_vphb_error_detected(struct cxl_afu *afu,
@@ -1462,8 +1431,8 @@ static pci_ers_result_t cxl_pci_error_detected(struct pci_dev *pdev,
 			return result;
 
 		cxl_context_detach_all(afu);
-		cxl_afu_deactivate_mode(afu);
-		cxl_deconfigure_afu(afu);
+		cxl_ops->afu_deactivate_mode(afu, afu->current_mode);
+		pci_deconfigure_afu(afu);
 	}
 	cxl_deconfigure_adapter(adapter);
 
@@ -1486,14 +1455,12 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 	for (i = 0; i < adapter->slices; i++) {
 		afu = adapter->afu[i];
 
-		if (cxl_configure_afu(afu, adapter, pdev))
+		if (pci_configure_afu(afu, adapter, pdev))
 			goto err;
 
 		if (cxl_afu_select_best_mode(afu))
 			goto err;
 
-		cxl_pci_vphb_reconfigure(afu);
-
 		list_for_each_entry(afu_dev, &afu->phb->bus->devices, bus_list) {
 			/* Reset the device context.
 			 * TODO: make this less disruptive
@@ -1509,7 +1476,7 @@ static pci_ers_result_t cxl_pci_slot_reset(struct pci_dev *pdev)
 
 			afu_dev->dev.archdata.cxl_ctx = ctx;
 
-			if (cxl_afu_check_and_enable(afu))
+			if (cxl_ops->afu_check_and_enable(afu))
 				goto err;
 
 			afu_dev->error_state = pci_channel_io_normal;
diff --git a/drivers/misc/cxl/sysfs.c b/drivers/misc/cxl/sysfs.c
index 02006f7109a8..907c99b9f5af 100644
--- a/drivers/misc/cxl/sysfs.c
+++ b/drivers/misc/cxl/sysfs.c
@@ -69,7 +69,7 @@ static ssize_t reset_adapter_store(struct device *device,
 	if ((rc != 1) || (val != 1))
 		return -EINVAL;
 
-	if ((rc = cxl_reset(adapter)))
+	if ((rc = cxl_ops->adapter_reset(adapter)))
 		return rc;
 	return count;
 }
@@ -165,7 +165,7 @@ static ssize_t pp_mmio_off_show(struct device *device,
 {
 	struct cxl_afu *afu = to_afu_chardev_m(device);
 
-	return scnprintf(buf, PAGE_SIZE, "%llu\n", afu->pp_offset);
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", afu->native->pp_offset);
 }
 
 static ssize_t pp_mmio_len_show(struct device *device,
@@ -211,7 +211,7 @@ static ssize_t reset_store_afu(struct device *device,
 		goto err;
 	}
 
-	if ((rc = __cxl_afu_reset(afu)))
+	if ((rc = cxl_ops->afu_reset(afu)))
 		goto err;
 
 	rc = count;
@@ -253,8 +253,14 @@ static ssize_t irqs_max_store(struct device *device,
 	if (irqs_max < afu->pp_irqs)
 		return -EINVAL;
 
-	if (irqs_max > afu->adapter->user_irqs)
-		return -EINVAL;
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		if (irqs_max > afu->adapter->user_irqs)
+			return -EINVAL;
+	} else {
+		/* pHyp sets a per-AFU limit */
+		if (irqs_max > afu->guest->max_ints)
+			return -EINVAL;
+	}
 
 	afu->irqs_max = irqs_max;
 	return count;
@@ -348,7 +354,7 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr,
 	}
 
 	/*
-	 * cxl_afu_deactivate_mode needs to be done outside the lock, prevent
+	 * afu_deactivate_mode needs to be done outside the lock, prevent
 	 * other contexts coming in before we are ready:
 	 */
 	old_mode = afu->current_mode;
@@ -357,9 +363,9 @@ static ssize_t mode_store(struct device *device, struct device_attribute *attr,
 
 	mutex_unlock(&afu->contexts_lock);
 
-	if ((rc = _cxl_afu_deactivate_mode(afu, old_mode)))
+	if ((rc = cxl_ops->afu_deactivate_mode(afu, old_mode)))
 		return rc;
-	if ((rc = cxl_afu_activate_mode(afu, mode)))
+	if ((rc = cxl_ops->afu_activate_mode(afu, mode)))
 		return rc;
 
 	return count;
@@ -389,7 +395,7 @@ static ssize_t afu_eb_read(struct file *filp, struct kobject *kobj,
 	struct cxl_afu *afu = to_cxl_afu(container_of(kobj,
 						      struct device, kobj));
 
-	return cxl_afu_read_err_buffer(afu, buf, off, count);
+	return cxl_ops->afu_read_err_buffer(afu, buf, off, count);
 }
 
 static struct device_attribute afu_attrs[] = {
@@ -406,24 +412,39 @@ static struct device_attribute afu_attrs[] = {
 
 int cxl_sysfs_adapter_add(struct cxl *adapter)
 {
+	struct device_attribute *dev_attr;
 	int i, rc;
 
 	for (i = 0; i < ARRAY_SIZE(adapter_attrs); i++) {
-		if ((rc = device_create_file(&adapter->dev, &adapter_attrs[i])))
-			goto err;
+		dev_attr = &adapter_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_ADAPTER_ATTRS)) {
+			if ((rc = device_create_file(&adapter->dev, dev_attr)))
+				goto err;
+		}
 	}
 	return 0;
 err:
-	for (i--; i >= 0; i--)
-		device_remove_file(&adapter->dev, &adapter_attrs[i]);
+	for (i--; i >= 0; i--) {
+		dev_attr = &adapter_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_ADAPTER_ATTRS))
+			device_remove_file(&adapter->dev, dev_attr);
+	}
 	return rc;
 }
+
 void cxl_sysfs_adapter_remove(struct cxl *adapter)
 {
+	struct device_attribute *dev_attr;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(adapter_attrs); i++)
-		device_remove_file(&adapter->dev, &adapter_attrs[i]);
+	for (i = 0; i < ARRAY_SIZE(adapter_attrs); i++) {
+		dev_attr = &adapter_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_ADAPTER_ATTRS))
+			device_remove_file(&adapter->dev, dev_attr);
+	}
 }
 
 struct afu_config_record {
@@ -469,10 +490,12 @@ static ssize_t afu_read_config(struct file *filp, struct kobject *kobj,
 	struct afu_config_record *cr = to_cr(kobj);
 	struct cxl_afu *afu = to_cxl_afu(container_of(kobj->parent, struct device, kobj));
 
-	u64 i, j, val;
+	u64 i, j, val, rc;
 
 	for (i = 0; i < count;) {
-		val = cxl_afu_cr_read64(afu, cr->cr, off & ~0x7);
+		rc = cxl_ops->afu_cr_read64(afu, cr->cr, off & ~0x7, &val);
+		if (rc)
+			val = ~0ULL;
 		for (j = off & 0x7; j < 8 && i < count; i++, j++, off++)
 			buf[i] = (val >> (j * 8)) & 0xff;
 	}
@@ -517,14 +540,22 @@ static struct afu_config_record *cxl_sysfs_afu_new_cr(struct cxl_afu *afu, int c
 		return ERR_PTR(-ENOMEM);
 
 	cr->cr = cr_idx;
-	cr->device = cxl_afu_cr_read16(afu, cr_idx, PCI_DEVICE_ID);
-	cr->vendor = cxl_afu_cr_read16(afu, cr_idx, PCI_VENDOR_ID);
-	cr->class = cxl_afu_cr_read32(afu, cr_idx, PCI_CLASS_REVISION) >> 8;
+
+	rc = cxl_ops->afu_cr_read16(afu, cr_idx, PCI_DEVICE_ID, &cr->device);
+	if (rc)
+		goto err;
+	rc = cxl_ops->afu_cr_read16(afu, cr_idx, PCI_VENDOR_ID, &cr->vendor);
+	if (rc)
+		goto err;
+	rc = cxl_ops->afu_cr_read32(afu, cr_idx, PCI_CLASS_REVISION, &cr->class);
+	if (rc)
+		goto err;
+	cr->class >>= 8;
 
 	/*
 	 * Export raw AFU PCIe like config record. For now this is read only by
 	 * root - we can expand that later to be readable by non-root and maybe
-	 * even writable provided we have a good use-case. Once we suport
+	 * even writable provided we have a good use-case. Once we support
 	 * exposing AFUs through a virtual PHB they will get that for free from
 	 * Linux' PCI infrastructure, but until then it's not clear that we
 	 * need it for anything since the main use case is just identifying
@@ -562,6 +593,7 @@ err:
 
 void cxl_sysfs_afu_remove(struct cxl_afu *afu)
 {
+	struct device_attribute *dev_attr;
 	struct afu_config_record *cr, *tmp;
 	int i;
 
@@ -569,8 +601,12 @@ void cxl_sysfs_afu_remove(struct cxl_afu *afu)
 	if (afu->eb_len)
 		device_remove_bin_file(&afu->dev, &afu->attr_eb);
 
-	for (i = 0; i < ARRAY_SIZE(afu_attrs); i++)
-		device_remove_file(&afu->dev, &afu_attrs[i]);
+	for (i = 0; i < ARRAY_SIZE(afu_attrs); i++) {
+		dev_attr = &afu_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_AFU_ATTRS))
+			device_remove_file(&afu->dev, &afu_attrs[i]);
+	}
 
 	list_for_each_entry_safe(cr, tmp, &afu->crs, list) {
 		sysfs_remove_bin_file(&cr->kobj, &cr->config_attr);
@@ -580,14 +616,19 @@ void cxl_sysfs_afu_remove(struct cxl_afu *afu)
 
 int cxl_sysfs_afu_add(struct cxl_afu *afu)
 {
+	struct device_attribute *dev_attr;
 	struct afu_config_record *cr;
 	int i, rc;
 
 	INIT_LIST_HEAD(&afu->crs);
 
 	for (i = 0; i < ARRAY_SIZE(afu_attrs); i++) {
-		if ((rc = device_create_file(&afu->dev, &afu_attrs[i])))
-			goto err;
+		dev_attr = &afu_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_AFU_ATTRS)) {
+			if ((rc = device_create_file(&afu->dev, &afu_attrs[i])))
+				goto err;
+		}
 	}
 
 	/* conditionally create the add the binary file for error info buffer */
@@ -626,32 +667,50 @@ err:
 	/* reset the eb_len as we havent created the bin attr */
 	afu->eb_len = 0;
 
-	for (i--; i >= 0; i--)
+	for (i--; i >= 0; i--) {
+		dev_attr = &afu_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_AFU_ATTRS))
 		device_remove_file(&afu->dev, &afu_attrs[i]);
+	}
 	return rc;
 }
 
 int cxl_sysfs_afu_m_add(struct cxl_afu *afu)
 {
+	struct device_attribute *dev_attr;
 	int i, rc;
 
 	for (i = 0; i < ARRAY_SIZE(afu_master_attrs); i++) {
-		if ((rc = device_create_file(afu->chardev_m, &afu_master_attrs[i])))
-			goto err;
+		dev_attr = &afu_master_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_AFU_MASTER_ATTRS)) {
+			if ((rc = device_create_file(afu->chardev_m, &afu_master_attrs[i])))
+				goto err;
+		}
 	}
 
 	return 0;
 
 err:
-	for (i--; i >= 0; i--)
-		device_remove_file(afu->chardev_m, &afu_master_attrs[i]);
+	for (i--; i >= 0; i--) {
+		dev_attr = &afu_master_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_AFU_MASTER_ATTRS))
+			device_remove_file(afu->chardev_m, &afu_master_attrs[i]);
+	}
 	return rc;
 }
 
 void cxl_sysfs_afu_m_remove(struct cxl_afu *afu)
 {
+	struct device_attribute *dev_attr;
 	int i;
 
-	for (i = 0; i < ARRAY_SIZE(afu_master_attrs); i++)
-		device_remove_file(afu->chardev_m, &afu_master_attrs[i]);
+	for (i = 0; i < ARRAY_SIZE(afu_master_attrs); i++) {
+		dev_attr = &afu_master_attrs[i];
+		if (cxl_ops->support_attributes(dev_attr->attr.name,
+						CXL_AFU_MASTER_ATTRS))
+			device_remove_file(afu->chardev_m, &afu_master_attrs[i]);
+	}
 }
diff --git a/drivers/misc/cxl/trace.h b/drivers/misc/cxl/trace.h
index 6e1e2adfba8e..751d6119683e 100644
--- a/drivers/misc/cxl/trace.h
+++ b/drivers/misc/cxl/trace.h
@@ -450,6 +450,199 @@ DEFINE_EVENT(cxl_pe_class, cxl_slbia,
 	TP_ARGS(ctx)
 );
 
+TRACE_EVENT(cxl_hcall,
+	TP_PROTO(u64 unit_address, u64 process_token, long rc),
+
+	TP_ARGS(unit_address, process_token, rc),
+
+	TP_STRUCT__entry(
+		__field(u64, unit_address)
+		__field(u64, process_token)
+		__field(long, rc)
+	),
+
+	TP_fast_assign(
+		__entry->unit_address = unit_address;
+		__entry->process_token = process_token;
+		__entry->rc = rc;
+	),
+
+	TP_printk("unit_address=0x%016llx process_token=0x%016llx rc=%li",
+		__entry->unit_address,
+		__entry->process_token,
+		__entry->rc
+	)
+);
+
+TRACE_EVENT(cxl_hcall_control,
+	TP_PROTO(u64 unit_address, char *fct, u64 p1, u64 p2, u64 p3,
+	u64 p4, unsigned long r4, long rc),
+
+	TP_ARGS(unit_address, fct, p1, p2, p3, p4, r4, rc),
+
+	TP_STRUCT__entry(
+		__field(u64, unit_address)
+		__field(char *, fct)
+		__field(u64, p1)
+		__field(u64, p2)
+		__field(u64, p3)
+		__field(u64, p4)
+		__field(unsigned long, r4)
+		__field(long, rc)
+	),
+
+	TP_fast_assign(
+		__entry->unit_address = unit_address;
+		__entry->fct = fct;
+		__entry->p1 = p1;
+		__entry->p2 = p2;
+		__entry->p3 = p3;
+		__entry->p4 = p4;
+		__entry->r4 = r4;
+		__entry->rc = rc;
+	),
+
+	TP_printk("unit_address=%#.16llx %s(%#llx, %#llx, %#llx, %#llx, R4: %#lx)): %li",
+		__entry->unit_address,
+		__entry->fct,
+		__entry->p1,
+		__entry->p2,
+		__entry->p3,
+		__entry->p4,
+		__entry->r4,
+		__entry->rc
+	)
+);
+
+TRACE_EVENT(cxl_hcall_attach,
+	TP_PROTO(u64 unit_address, u64 phys_addr, unsigned long process_token,
+		unsigned long mmio_addr, unsigned long mmio_size, long rc),
+
+	TP_ARGS(unit_address, phys_addr, process_token,
+		mmio_addr, mmio_size, rc),
+
+	TP_STRUCT__entry(
+		__field(u64, unit_address)
+		__field(u64, phys_addr)
+		__field(unsigned long, process_token)
+		__field(unsigned long, mmio_addr)
+		__field(unsigned long, mmio_size)
+		__field(long, rc)
+	),
+
+	TP_fast_assign(
+		__entry->unit_address = unit_address;
+		__entry->phys_addr = phys_addr;
+		__entry->process_token = process_token;
+		__entry->mmio_addr = mmio_addr;
+		__entry->mmio_size = mmio_size;
+		__entry->rc = rc;
+	),
+
+	TP_printk("unit_address=0x%016llx phys_addr=0x%016llx "
+		"token=0x%.8lx mmio_addr=0x%lx mmio_size=0x%lx rc=%li",
+		__entry->unit_address,
+		__entry->phys_addr,
+		__entry->process_token,
+		__entry->mmio_addr,
+		__entry->mmio_size,
+		__entry->rc
+	)
+);
+
+DEFINE_EVENT(cxl_hcall, cxl_hcall_detach,
+	TP_PROTO(u64 unit_address, u64 process_token, long rc),
+	TP_ARGS(unit_address, process_token, rc)
+);
+
+DEFINE_EVENT(cxl_hcall_control, cxl_hcall_control_function,
+	TP_PROTO(u64 unit_address, char *fct, u64 p1, u64 p2, u64 p3,
+	u64 p4, unsigned long r4, long rc),
+	TP_ARGS(unit_address, fct, p1, p2, p3, p4, r4, rc)
+);
+
+DEFINE_EVENT(cxl_hcall, cxl_hcall_collect_int_info,
+	TP_PROTO(u64 unit_address, u64 process_token, long rc),
+	TP_ARGS(unit_address, process_token, rc)
+);
+
+TRACE_EVENT(cxl_hcall_control_faults,
+	TP_PROTO(u64 unit_address, u64 process_token,
+		u64 control_mask, u64 reset_mask, unsigned long r4,
+		long rc),
+
+	TP_ARGS(unit_address, process_token,
+		control_mask, reset_mask, r4, rc),
+
+	TP_STRUCT__entry(
+		__field(u64, unit_address)
+		__field(u64, process_token)
+		__field(u64, control_mask)
+		__field(u64, reset_mask)
+		__field(unsigned long, r4)
+		__field(long, rc)
+	),
+
+	TP_fast_assign(
+		__entry->unit_address = unit_address;
+		__entry->process_token = process_token;
+		__entry->control_mask = control_mask;
+		__entry->reset_mask = reset_mask;
+		__entry->r4 = r4;
+		__entry->rc = rc;
+	),
+
+	TP_printk("unit_address=0x%016llx process_token=0x%llx "
+		"control_mask=%#llx reset_mask=%#llx r4=%#lx rc=%li",
+		__entry->unit_address,
+		__entry->process_token,
+		__entry->control_mask,
+		__entry->reset_mask,
+		__entry->r4,
+		__entry->rc
+	)
+);
+
+DEFINE_EVENT(cxl_hcall_control, cxl_hcall_control_facility,
+	TP_PROTO(u64 unit_address, char *fct, u64 p1, u64 p2, u64 p3,
+	u64 p4, unsigned long r4, long rc),
+	TP_ARGS(unit_address, fct, p1, p2, p3, p4, r4, rc)
+);
+
+TRACE_EVENT(cxl_hcall_download_facility,
+	TP_PROTO(u64 unit_address, char *fct, u64 list_address, u64 num,
+	unsigned long r4, long rc),
+
+	TP_ARGS(unit_address, fct, list_address, num, r4, rc),
+
+	TP_STRUCT__entry(
+		__field(u64, unit_address)
+		__field(char *, fct)
+		__field(u64, list_address)
+		__field(u64, num)
+		__field(unsigned long, r4)
+		__field(long, rc)
+	),
+
+	TP_fast_assign(
+		__entry->unit_address = unit_address;
+		__entry->fct = fct;
+		__entry->list_address = list_address;
+		__entry->num = num;
+		__entry->r4 = r4;
+		__entry->rc = rc;
+	),
+
+	TP_printk("%#.16llx, %s(%#llx, %#llx), %#lx): %li",
+		__entry->unit_address,
+		__entry->fct,
+		__entry->list_address,
+		__entry->num,
+		__entry->r4,
+		__entry->rc
+	)
+);
+
 #endif /* _CXL_TRACE_H */
 
 /* This part must be outside protection */
diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index cbd4331fb45c..cdc7723b845d 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -49,7 +49,7 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev)
 	phb = pci_bus_to_host(dev->bus);
 	afu = (struct cxl_afu *)phb->private_data;
 
-	if (!cxl_adapter_link_ok(afu->adapter)) {
+	if (!cxl_ops->link_ok(afu->adapter, afu)) {
 		dev_warn(&dev->dev, "%s: Device link is down, refusing to enable AFU\n", __func__);
 		return false;
 	}
@@ -66,7 +66,7 @@ static bool cxl_pci_enable_device_hook(struct pci_dev *dev)
 		return false;
 	dev->dev.archdata.cxl_ctx = ctx;
 
-	return (cxl_afu_check_and_enable(afu) == 0);
+	return (cxl_ops->afu_check_and_enable(afu) == 0);
 }
 
 static void cxl_pci_disable_device(struct pci_dev *dev)
@@ -99,113 +99,90 @@ static int cxl_pcie_cfg_record(u8 bus, u8 devfn)
 	return (bus << 8) + devfn;
 }
 
-static unsigned long cxl_pcie_cfg_addr(struct pci_controller* phb,
-				       u8 bus, u8 devfn, int offset)
-{
-	int record = cxl_pcie_cfg_record(bus, devfn);
-
-	return (unsigned long)phb->cfg_addr + ((unsigned long)phb->cfg_data * record) + offset;
-}
-
-
 static int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn,
-				int offset, int len,
-				volatile void __iomem **ioaddr,
-				u32 *mask, int *shift)
+				struct cxl_afu **_afu, int *_record)
 {
 	struct pci_controller *phb;
 	struct cxl_afu *afu;
-	unsigned long addr;
+	int record;
 
 	phb = pci_bus_to_host(bus);
 	if (phb == NULL)
 		return PCIBIOS_DEVICE_NOT_FOUND;
-	afu = (struct cxl_afu *)phb->private_data;
 
-	if (cxl_pcie_cfg_record(bus->number, devfn) > afu->crs_num)
+	afu = (struct cxl_afu *)phb->private_data;
+	record = cxl_pcie_cfg_record(bus->number, devfn);
+	if (record > afu->crs_num)
 		return PCIBIOS_DEVICE_NOT_FOUND;
-	if (offset >= (unsigned long)phb->cfg_data)
-		return PCIBIOS_BAD_REGISTER_NUMBER;
-	addr = cxl_pcie_cfg_addr(phb, bus->number, devfn, offset);
 
-	*ioaddr = (void *)(addr & ~0x3ULL);
-	*shift = ((addr & 0x3) * 8);
-	switch (len) {
-	case 1:
-		*mask = 0xff;
-		break;
-	case 2:
-		*mask = 0xffff;
-		break;
-	default:
-		*mask = 0xffffffff;
-		break;
-	}
+	*_afu = afu;
+	*_record = record;
 	return 0;
 }
 
-
-static inline bool cxl_config_link_ok(struct pci_bus *bus)
-{
-	struct pci_controller *phb;
-	struct cxl_afu *afu;
-
-	/* Config space IO is based on phb->cfg_addr, which is based on
-	 * afu_desc_mmio. This isn't safe to read/write when the link
-	 * goes down, as EEH tears down MMIO space.
-	 *
-	 * Check if the link is OK before proceeding.
-	 */
-
-	phb = pci_bus_to_host(bus);
-	if (phb == NULL)
-		return false;
-	afu = (struct cxl_afu *)phb->private_data;
-	return cxl_adapter_link_ok(afu->adapter);
-}
-
 static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn,
 				int offset, int len, u32 *val)
 {
-	volatile void __iomem *ioaddr;
-	int shift, rc;
-	u32 mask;
+	int rc, record;
+	struct cxl_afu *afu;
+	u8 val8;
+	u16 val16;
+	u32 val32;
 
-	rc = cxl_pcie_config_info(bus, devfn, offset, len, &ioaddr,
-				  &mask, &shift);
+	rc = cxl_pcie_config_info(bus, devfn, &afu, &record);
 	if (rc)
 		return rc;
 
-	if (!cxl_config_link_ok(bus))
+	switch (len) {
+	case 1:
+		rc = cxl_ops->afu_cr_read8(afu, record, offset,	&val8);
+		*val = val8;
+		break;
+	case 2:
+		rc = cxl_ops->afu_cr_read16(afu, record, offset, &val16);
+		*val = val16;
+		break;
+	case 4:
+		rc = cxl_ops->afu_cr_read32(afu, record, offset, &val32);
+		*val = val32;
+		break;
+	default:
+		WARN_ON(1);
+	}
+
+	if (rc)
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
-	/* Can only read 32 bits */
-	*val = (in_le32(ioaddr) >> shift) & mask;
 	return PCIBIOS_SUCCESSFUL;
 }
 
 static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn,
 				 int offset, int len, u32 val)
 {
-	volatile void __iomem *ioaddr;
-	u32 v, mask;
-	int shift, rc;
+	int rc, record;
+	struct cxl_afu *afu;
 
-	rc = cxl_pcie_config_info(bus, devfn, offset, len, &ioaddr,
-				  &mask, &shift);
+	rc = cxl_pcie_config_info(bus, devfn, &afu, &record);
 	if (rc)
 		return rc;
 
-	if (!cxl_config_link_ok(bus))
-		return PCIBIOS_DEVICE_NOT_FOUND;
-
-	/* Can only write 32 bits so do read-modify-write */
-	mask <<= shift;
-	val <<= shift;
+	switch (len) {
+	case 1:
+		rc = cxl_ops->afu_cr_write8(afu, record, offset, val & 0xff);
+		break;
+	case 2:
+		rc = cxl_ops->afu_cr_write16(afu, record, offset, val & 0xffff);
+		break;
+	case 4:
+		rc = cxl_ops->afu_cr_write32(afu, record, offset, val);
+		break;
+	default:
+		WARN_ON(1);
+	}
 
-	v = (in_le32(ioaddr) & ~mask) | (val & mask);
+	if (rc)
+		return PCIBIOS_SET_FAILED;
 
-	out_le32(ioaddr, v);
 	return PCIBIOS_SUCCESSFUL;
 }
 
@@ -233,23 +210,31 @@ int cxl_pci_vphb_add(struct cxl_afu *afu)
 {
 	struct pci_dev *phys_dev;
 	struct pci_controller *phb, *phys_phb;
-
-	phys_dev = to_pci_dev(afu->adapter->dev.parent);
-	phys_phb = pci_bus_to_host(phys_dev->bus);
+	struct device_node *vphb_dn;
+	struct device *parent;
+
+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
+		phys_dev = to_pci_dev(afu->adapter->dev.parent);
+		phys_phb = pci_bus_to_host(phys_dev->bus);
+		vphb_dn = phys_phb->dn;
+		parent = &phys_dev->dev;
+	} else {
+		vphb_dn = afu->adapter->dev.parent->of_node;
+		parent = afu->adapter->dev.parent;
+	}
 
 	/* Alloc and setup PHB data structure */
-	phb = pcibios_alloc_controller(phys_phb->dn);
-
+	phb = pcibios_alloc_controller(vphb_dn);
 	if (!phb)
 		return -ENODEV;
 
 	/* Setup parent in sysfs */
-	phb->parent = &phys_dev->dev;
+	phb->parent = parent;
 
 	/* Setup the PHB using arch provided callback */
 	phb->ops = &cxl_pcie_pci_ops;
-	phb->cfg_addr = afu->afu_desc_mmio + afu->crs_offset;
-	phb->cfg_data = (void *)(u64)afu->crs_len;
+	phb->cfg_addr = NULL;
+	phb->cfg_data = 0;
 	phb->private_data = afu;
 	phb->controller_ops = cxl_pci_controller_ops;
 
@@ -272,15 +257,6 @@ int cxl_pci_vphb_add(struct cxl_afu *afu)
 	return 0;
 }
 
-void cxl_pci_vphb_reconfigure(struct cxl_afu *afu)
-{
-	/* When we are reconfigured, the AFU's MMIO space is unmapped
-	 * and remapped. We need to reflect this in the PHB's view of
-	 * the world.
-	 */
-	afu->phb->cfg_addr = afu->afu_desc_mmio + afu->crs_offset;
-}
-
 void cxl_pci_vphb_remove(struct cxl_afu *afu)
 {
 	struct pci_controller *phb;
@@ -296,6 +272,15 @@ void cxl_pci_vphb_remove(struct cxl_afu *afu)
 	pcibios_free_controller(phb);
 }
 
+bool cxl_pci_is_vphb_device(struct pci_dev *dev)
+{
+	struct pci_controller *phb;
+
+	phb = pci_bus_to_host(dev->bus);
+
+	return (phb->ops == &cxl_pcie_pci_ops);
+}
+
 struct cxl_afu *cxl_pci_to_afu(struct pci_dev *dev)
 {
 	struct pci_controller *phb;
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index 89b3befc7155..6469ff6208b0 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -271,6 +271,8 @@ bool pci_bus_clip_resource(struct pci_dev *dev, int idx)
 
 void __weak pcibios_resource_survey_bus(struct pci_bus *bus) { }
 
+void __weak pcibios_bus_add_device(struct pci_dev *pdev) { }
+
 /**
  * pci_bus_add_device - start driver for a single device
  * @dev: device to add
@@ -285,6 +287,7 @@ void pci_bus_add_device(struct pci_dev *dev)
 	 * Can not put in pci_device_add yet because resources
 	 * are not assigned yet for some devices.
 	 */
+	pcibios_bus_add_device(dev);
 	pci_fixup_device(pci_fixup_final, dev);
 	pci_create_sysfs_dev_files(dev);
 	pci_proc_attach_device(dev);
diff --git a/drivers/pci/iov.c b/drivers/pci/iov.c
index 31f31d460fc9..fa4f13869fa9 100644
--- a/drivers/pci/iov.c
+++ b/drivers/pci/iov.c
@@ -113,7 +113,7 @@ resource_size_t pci_iov_resource_size(struct pci_dev *dev, int resno)
 	return dev->sriov->barsz[resno - PCI_IOV_RESOURCES];
 }
 
-static int virtfn_add(struct pci_dev *dev, int id, int reset)
+int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset)
 {
 	int i;
 	int rc = -ENOMEM;
@@ -188,7 +188,7 @@ failed:
 	return rc;
 }
 
-static void virtfn_remove(struct pci_dev *dev, int id, int reset)
+void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset)
 {
 	char buf[VIRTFN_ID_LEN];
 	struct pci_dev *virtfn;
@@ -321,7 +321,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 	}
 
 	for (i = 0; i < initial; i++) {
-		rc = virtfn_add(dev, i, 0);
+		rc = pci_iov_add_virtfn(dev, i, 0);
 		if (rc)
 			goto failed;
 	}
@@ -333,7 +333,7 @@ static int sriov_enable(struct pci_dev *dev, int nr_virtfn)
 
 failed:
 	while (i--)
-		virtfn_remove(dev, i, 0);
+		pci_iov_remove_virtfn(dev, i, 0);
 
 	pcibios_sriov_disable(dev);
 err_pcibios:
@@ -359,7 +359,7 @@ static void sriov_disable(struct pci_dev *dev)
 		return;
 
 	for (i = 0; i < iov->num_VFs; i++)
-		virtfn_remove(dev, i, 0);
+		pci_iov_remove_virtfn(dev, i, 0);
 
 	pcibios_sriov_disable(dev);
 
diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h
index 5ada9268a450..580f37006977 100644
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -106,7 +106,6 @@ struct cxlflash_cfg {
 	atomic_t scan_host_needed;
 
 	struct cxl_afu *cxl_afu;
-	struct pci_dev *parent_dev;
 
 	atomic_t recovery_threads;
 	struct mutex ctx_recovery_mutex;
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c
index f6d90ce8f3b7..e04aae70643b 100644
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -1407,7 +1407,7 @@ static int start_context(struct cxlflash_cfg *cfg)
  */
 static int read_vpd(struct cxlflash_cfg *cfg, u64 wwpn[])
 {
-	struct pci_dev *dev = cfg->parent_dev;
+	struct pci_dev *dev = cfg->dev;
 	int rc = 0;
 	int ro_start, ro_size, i, j, k;
 	ssize_t vpd_size;
@@ -1416,7 +1416,7 @@ static int read_vpd(struct cxlflash_cfg *cfg, u64 wwpn[])
 	char *wwpn_vpd_tags[NUM_FC_PORTS] = { "V5", "V6" };
 
 	/* Get the VPD data from the device */
-	vpd_size = pci_read_vpd(dev, 0, sizeof(vpd_data), vpd_data);
+	vpd_size = cxl_read_adapter_vpd(dev, vpd_data, sizeof(vpd_data));
 	if (unlikely(vpd_size <= 0)) {
 		dev_err(&dev->dev, "%s: Unable to read VPD (size = %ld)\n",
 		       __func__, vpd_size);
@@ -2392,7 +2392,6 @@ static int cxlflash_probe(struct pci_dev *pdev,
 {
 	struct Scsi_Host *host;
 	struct cxlflash_cfg *cfg = NULL;
-	struct device *phys_dev;
 	struct dev_dependent_vals *ddv;
 	int rc = 0;
 
@@ -2458,19 +2457,6 @@ static int cxlflash_probe(struct pci_dev *pdev,
 
 	pci_set_drvdata(pdev, cfg);
 
-	/*
-	 * Use the special service provided to look up the physical
-	 * PCI device, since we are called on the probe of the virtual
-	 * PCI host bus (vphb)
-	 */
-	phys_dev = cxl_get_phys_dev(pdev);
-	if (!dev_is_pci(phys_dev)) {
-		dev_err(&pdev->dev, "%s: not a pci dev\n", __func__);
-		rc = -ENODEV;
-		goto out_remove;
-	}
-	cfg->parent_dev = to_pci_dev(phys_dev);
-
 	cfg->cxl_afu = cxl_pci_to_afu(pdev);
 
 	rc = init_pci(cfg);
diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 0b3c0d39ef75..c370b261c720 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -239,6 +239,14 @@ extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
 			    pmd_t *pmdp);
 #endif
 
+#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE
+static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma,
+					   unsigned long address, pmd_t *pmdp)
+{
+
+}
+#endif
+
 #ifndef __HAVE_ARCH_PTE_SAME
 static inline int pte_same(pte_t pte_a, pte_t pte_b)
 {
diff --git a/include/linux/atomic.h b/include/linux/atomic.h
index 301de78d65f7..5f3ee5a60a81 100644
--- a/include/linux/atomic.h
+++ b/include/linux/atomic.h
@@ -34,20 +34,29 @@
  * The idea here is to build acquire/release variants by adding explicit
  * barriers on top of the relaxed variant. In the case where the relaxed
  * variant is already fully ordered, no additional barriers are needed.
+ *
+ * Besides, if an arch has a special barrier for acquire/release, it could
+ * implement its own __atomic_op_* and use the same framework for building
+ * variants
  */
+#ifndef __atomic_op_acquire
 #define __atomic_op_acquire(op, args...)				\
 ({									\
 	typeof(op##_relaxed(args)) __ret  = op##_relaxed(args);		\
 	smp_mb__after_atomic();						\
 	__ret;								\
 })
+#endif
 
+#ifndef __atomic_op_release
 #define __atomic_op_release(op, args...)				\
 ({									\
 	smp_mb__before_atomic();					\
 	op##_relaxed(args);						\
 })
+#endif
 
+#ifndef __atomic_op_fence
 #define __atomic_op_fence(op, args...)					\
 ({									\
 	typeof(op##_relaxed(args)) __ret;				\
@@ -56,6 +65,7 @@
 	smp_mb__after_atomic();						\
 	__ret;								\
 })
+#endif
 
 /* atomic_add_return_relaxed */
 #ifndef atomic_add_return_relaxed
diff --git a/include/linux/bug.h b/include/linux/bug.h
index 7f4818673c41..e51b0709e78d 100644
--- a/include/linux/bug.h
+++ b/include/linux/bug.h
@@ -20,6 +20,7 @@ struct pt_regs;
 #define BUILD_BUG_ON_MSG(cond, msg) (0)
 #define BUILD_BUG_ON(condition) (0)
 #define BUILD_BUG() (0)
+#define MAYBE_BUILD_BUG_ON(cond) (0)
 #else /* __CHECKER__ */
 
 /* Force a compilation error if a constant expression is not a power of 2 */
@@ -83,6 +84,14 @@ struct pt_regs;
  */
 #define BUILD_BUG() BUILD_BUG_ON_MSG(1, "BUILD_BUG failed")
 
+#define MAYBE_BUILD_BUG_ON(cond)			\
+	do {						\
+		if (__builtin_constant_p((cond)))       \
+			BUILD_BUG_ON(cond);             \
+		else                                    \
+			BUG_ON(cond);                   \
+	} while (0)
+
 #endif	/* __CHECKER__ */
 
 #ifdef CONFIG_GENERIC_BUG
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 459fd25b378e..f12513a20a06 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -111,9 +111,6 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			__split_huge_pmd(__vma, __pmd, __address);	\
 	}  while (0)
 
-#if HPAGE_PMD_ORDER >= MAX_ORDER
-#error "hugepages can't be allocated by the buddy allocator"
-#endif
 extern int hugepage_madvise(struct vm_area_struct *vma,
 			    unsigned long *vm_flags, int advice);
 extern void vma_adjust_trans_huge(struct vm_area_struct *vma,
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 27df4a6585da..bc435d6293d2 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -770,6 +770,7 @@ extern struct list_head pci_root_buses;	/* list of all known PCI buses */
 int no_pci_devices(void);
 
 void pcibios_resource_survey_bus(struct pci_bus *bus);
+void pcibios_bus_add_device(struct pci_dev *pdev);
 void pcibios_add_bus(struct pci_bus *bus);
 void pcibios_remove_bus(struct pci_bus *bus);
 void pcibios_fixup_bus(struct pci_bus *);
@@ -1738,6 +1739,8 @@ int pci_iov_virtfn_devfn(struct pci_dev *dev, int id);
 
 int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn);
 void pci_disable_sriov(struct pci_dev *dev);
+int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset);
+void pci_iov_remove_virtfn(struct pci_dev *dev, int id, int reset);
 int pci_num_vf(struct pci_dev *dev);
 int pci_vfs_assigned(struct pci_dev *dev);
 int pci_sriov_set_totalvfs(struct pci_dev *dev, u16 numvfs);
@@ -1754,6 +1757,12 @@ static inline int pci_iov_virtfn_devfn(struct pci_dev *dev, int id)
 }
 static inline int pci_enable_sriov(struct pci_dev *dev, int nr_virtfn)
 { return -ENODEV; }
+static inline int pci_iov_add_virtfn(struct pci_dev *dev, int id, int reset)
+{
+	return -ENOSYS;
+}
+static inline void pci_iov_remove_virtfn(struct pci_dev *dev,
+					 int id, int reset) { }
 static inline void pci_disable_sriov(struct pci_dev *dev) { }
 static inline int pci_num_vf(struct pci_dev *dev) { return 0; }
 static inline int pci_vfs_assigned(struct pci_dev *dev)
diff --git a/include/misc/cxl.h b/include/misc/cxl.h
index f2ffe5bd720d..7d5e2613c7b8 100644
--- a/include/misc/cxl.h
+++ b/include/misc/cxl.h
@@ -30,9 +30,6 @@ struct cxl_afu *cxl_pci_to_afu(struct pci_dev *dev);
 /* Get the AFU conf record number associated with a pci_dev */
 unsigned int cxl_pci_to_cfg_record(struct pci_dev *dev);
 
-/* Get the physical device (ie. the PCIe card) which the AFU is attached */
-struct device *cxl_get_phys_dev(struct pci_dev *dev);
-
 
 /*
  * Context lifetime overview:
@@ -210,4 +207,9 @@ ssize_t cxl_fd_read(struct file *file, char __user *buf, size_t count,
 void cxl_perst_reloads_same_image(struct cxl_afu *afu,
 				  bool perst_reloads_same_image);
 
+/*
+ * Read the VPD for the card where the AFU resides
+ */
+ssize_t cxl_read_adapter_vpd(struct pci_dev *dev, void *buf, size_t count);
+
 #endif /* _MISC_CXL_H */
diff --git a/include/uapi/misc/cxl.h b/include/uapi/misc/cxl.h
index 1e889aa8a36e..8cd334f99ddc 100644
--- a/include/uapi/misc/cxl.h
+++ b/include/uapi/misc/cxl.h
@@ -55,11 +55,35 @@ struct cxl_afu_id {
 	__u64 reserved6;
 };
 
+/* base adapter image header is included in the image */
+#define CXL_AI_NEED_HEADER	0x0000000000000001ULL
+#define CXL_AI_ALL		CXL_AI_NEED_HEADER
+
+#define CXL_AI_HEADER_SIZE 128
+#define CXL_AI_BUFFER_SIZE 4096
+#define CXL_AI_MAX_ENTRIES 256
+#define CXL_AI_MAX_CHUNK_SIZE (CXL_AI_BUFFER_SIZE * CXL_AI_MAX_ENTRIES)
+
+struct cxl_adapter_image {
+	__u64 flags;
+	__u64 data;
+	__u64 len_data;
+	__u64 len_image;
+	__u64 reserved1;
+	__u64 reserved2;
+	__u64 reserved3;
+	__u64 reserved4;
+};
+
 /* ioctl numbers */
 #define CXL_MAGIC 0xCA
+/* AFU devices */
 #define CXL_IOCTL_START_WORK		_IOW(CXL_MAGIC, 0x00, struct cxl_ioctl_start_work)
 #define CXL_IOCTL_GET_PROCESS_ELEMENT	_IOR(CXL_MAGIC, 0x01, __u32)
 #define CXL_IOCTL_GET_AFU_ID            _IOR(CXL_MAGIC, 0x02, struct cxl_afu_id)
+/* adapter devices */
+#define CXL_IOCTL_DOWNLOAD_IMAGE        _IOW(CXL_MAGIC, 0x0A, struct cxl_adapter_image)
+#define CXL_IOCTL_VALIDATE_IMAGE        _IOW(CXL_MAGIC, 0x0B, struct cxl_adapter_image)
 
 #define CXL_READ_MIN_SIZE 0x1000 /* 4K */
 
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08fc0ba2207e..36c22a89df61 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -83,7 +83,7 @@ unsigned long transparent_hugepage_flags __read_mostly =
 	(1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
 
 /* default scan 8*512 pte (or vmas) every 30 second */
-static unsigned int khugepaged_pages_to_scan __read_mostly = HPAGE_PMD_NR*8;
+static unsigned int khugepaged_pages_to_scan __read_mostly;
 static unsigned int khugepaged_pages_collapsed;
 static unsigned int khugepaged_full_scans;
 static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
@@ -98,7 +98,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  * it would have happened if the vma was large enough during page
  * fault.
  */
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
@@ -660,6 +660,18 @@ static int __init hugepage_init(void)
 		return -EINVAL;
 	}
 
+	khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
+	khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
+	/*
+	 * hugepages can't be allocated by the buddy allocator
+	 */
+	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER >= MAX_ORDER);
+	/*
+	 * we use page->mapping and page->index in second tail page
+	 * as list_head: assuming THP order >= 2
+	 */
+	MAYBE_BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
+
 	err = hugepage_init_sysfs(&hugepage_kobj);
 	if (err)
 		goto err_sysfs;
@@ -764,7 +776,6 @@ void prep_transhuge_page(struct page *page)
 	 * we use page->mapping and page->indexlru in second tail page
 	 * as list_head: assuming THP order >= 2
 	 */
-	BUILD_BUG_ON(HPAGE_PMD_ORDER < 2);
 
 	INIT_LIST_HEAD(page_deferred_list(page));
 	set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
@@ -2860,6 +2871,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
 	young = pmd_young(*pmd);
 	dirty = pmd_dirty(*pmd);
 
+	pmdp_huge_split_prepare(vma, haddr, pmd);
 	pgtable = pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);
 
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index 0c2706bda330..b08f77cbe31b 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -8,7 +8,7 @@ ifeq ($(ARCH),powerpc)
 
 GIT_VERSION = $(shell git describe --always --long --dirty || echo "unknown")
 
-CFLAGS := -Wall -O2 -flto -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) $(CFLAGS)
+CFLAGS := -Wall -O2 -Wall -Werror -DGIT_VERSION='"$(GIT_VERSION)"' -I$(CURDIR) $(CFLAGS)
 
 export CFLAGS
 
@@ -22,7 +22,8 @@ SUB_DIRS = benchmarks 		\
 	   switch_endian	\
 	   syscalls		\
 	   tm			\
-	   vphn
+	   vphn         \
+	   math
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/basic_asm.h b/tools/testing/selftests/powerpc/basic_asm.h
new file mode 100644
index 000000000000..3349a0704d1a
--- /dev/null
+++ b/tools/testing/selftests/powerpc/basic_asm.h
@@ -0,0 +1,70 @@
+#ifndef _SELFTESTS_POWERPC_BASIC_ASM_H
+#define _SELFTESTS_POWERPC_BASIC_ASM_H
+
+#include <ppc-asm.h>
+#include <asm/unistd.h>
+
+#define LOAD_REG_IMMEDIATE(reg,expr) \
+	lis	reg,(expr)@highest;	\
+	ori	reg,reg,(expr)@higher;	\
+	rldicr	reg,reg,32,31;	\
+	oris	reg,reg,(expr)@high;	\
+	ori	reg,reg,(expr)@l;
+
+/*
+ * Note: These macros assume that variables being stored on the stack are
+ * doublewords, while this is usually the case it may not always be the
+ * case for each use case.
+ */
+#if defined(_CALL_ELF) && _CALL_ELF == 2
+#define STACK_FRAME_MIN_SIZE 32
+#define STACK_FRAME_TOC_POS  24
+#define __STACK_FRAME_PARAM(_param)  (32 + ((_param)*8))
+#define __STACK_FRAME_LOCAL(_num_params,_var_num)  ((STACK_FRAME_PARAM(_num_params)) + ((_var_num)*8))
+#else
+#define STACK_FRAME_MIN_SIZE 112
+#define STACK_FRAME_TOC_POS  40
+#define __STACK_FRAME_PARAM(i)  (48 + ((i)*8))
+
+/*
+ * Caveat: if a function passed more than 8 doublewords, the caller will have
+ * made more space... which would render the 112 incorrect.
+ */
+#define __STACK_FRAME_LOCAL(_num_params,_var_num)  (112 + ((_var_num)*8))
+#endif
+
+/* Parameter x saved to the stack */
+#define STACK_FRAME_PARAM(var)    __STACK_FRAME_PARAM(var)
+
+/* Local variable x saved to the stack after x parameters */
+#define STACK_FRAME_LOCAL(num_params,var)    __STACK_FRAME_LOCAL(num_params,var)
+#define STACK_FRAME_LR_POS   16
+#define STACK_FRAME_CR_POS   8
+
+/*
+ * It is very important to note here that _extra is the extra amount of
+ * stack space needed. This space can be accessed using STACK_FRAME_PARAM()
+ * or STACK_FRAME_LOCAL() macros.
+ *
+ * r1 and r2 are not defined in ppc-asm.h (instead they are defined as sp
+ * and toc). Kernel programmers tend to prefer rX even for r1 and r2, hence
+ * %1 and %r2. r0 is defined in ppc-asm.h and therefore %r0 gets
+ * preprocessed incorrectly, hence r0.
+ */
+#define PUSH_BASIC_STACK(_extra) \
+	mflr	r0; \
+	std	r0,STACK_FRAME_LR_POS(%r1); \
+	stdu	%r1,-(_extra + STACK_FRAME_MIN_SIZE)(%r1); \
+	mfcr	r0; \
+	stw	r0,STACK_FRAME_CR_POS(%r1); \
+	std	%r2,STACK_FRAME_TOC_POS(%r1);
+
+#define POP_BASIC_STACK(_extra) \
+	ld	%r2,STACK_FRAME_TOC_POS(%r1); \
+	lwz	r0,STACK_FRAME_CR_POS(%r1); \
+	mtcr	r0; \
+	addi	%r1,%r1,(_extra + STACK_FRAME_MIN_SIZE); \
+	ld	r0,STACK_FRAME_LR_POS(%r1); \
+	mtlr	r0;
+
+#endif /* _SELFTESTS_POWERPC_BASIC_ASM_H */
diff --git a/tools/testing/selftests/powerpc/math/.gitignore b/tools/testing/selftests/powerpc/math/.gitignore
new file mode 100644
index 000000000000..4fe13a439fd7
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/.gitignore
@@ -0,0 +1,6 @@
+fpu_syscall
+vmx_syscall
+fpu_preempt
+vmx_preempt
+fpu_signal
+vmx_signal
diff --git a/tools/testing/selftests/powerpc/math/Makefile b/tools/testing/selftests/powerpc/math/Makefile
new file mode 100644
index 000000000000..5b88875d5955
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/Makefile
@@ -0,0 +1,19 @@
+TEST_PROGS := fpu_syscall fpu_preempt fpu_signal vmx_syscall vmx_preempt vmx_signal
+
+all: $(TEST_PROGS)
+
+$(TEST_PROGS): ../harness.c
+$(TEST_PROGS): CFLAGS += -O2 -g -pthread -m64 -maltivec
+
+fpu_syscall: fpu_asm.S
+fpu_preempt: fpu_asm.S
+fpu_signal:  fpu_asm.S
+
+vmx_syscall: vmx_asm.S
+vmx_preempt: vmx_asm.S
+vmx_signal: vmx_asm.S
+
+include ../../lib.mk
+
+clean:
+	rm -f $(TEST_PROGS) *.o
diff --git a/tools/testing/selftests/powerpc/math/fpu_asm.S b/tools/testing/selftests/powerpc/math/fpu_asm.S
new file mode 100644
index 000000000000..f3711d80e709
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_asm.S
@@ -0,0 +1,198 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "../basic_asm.h"
+
+#define PUSH_FPU(pos) \
+	stfd	f14,pos(sp); \
+	stfd	f15,pos+8(sp); \
+	stfd	f16,pos+16(sp); \
+	stfd	f17,pos+24(sp); \
+	stfd	f18,pos+32(sp); \
+	stfd	f19,pos+40(sp); \
+	stfd	f20,pos+48(sp); \
+	stfd	f21,pos+56(sp); \
+	stfd	f22,pos+64(sp); \
+	stfd	f23,pos+72(sp); \
+	stfd	f24,pos+80(sp); \
+	stfd	f25,pos+88(sp); \
+	stfd	f26,pos+96(sp); \
+	stfd	f27,pos+104(sp); \
+	stfd	f28,pos+112(sp); \
+	stfd	f29,pos+120(sp); \
+	stfd	f30,pos+128(sp); \
+	stfd	f31,pos+136(sp);
+
+#define POP_FPU(pos) \
+	lfd	f14,pos(sp); \
+	lfd	f15,pos+8(sp); \
+	lfd	f16,pos+16(sp); \
+	lfd	f17,pos+24(sp); \
+	lfd	f18,pos+32(sp); \
+	lfd	f19,pos+40(sp); \
+	lfd	f20,pos+48(sp); \
+	lfd	f21,pos+56(sp); \
+	lfd	f22,pos+64(sp); \
+	lfd	f23,pos+72(sp); \
+	lfd	f24,pos+80(sp); \
+	lfd	f25,pos+88(sp); \
+	lfd	f26,pos+96(sp); \
+	lfd	f27,pos+104(sp); \
+	lfd	f28,pos+112(sp); \
+	lfd	f29,pos+120(sp); \
+	lfd	f30,pos+128(sp); \
+	lfd	f31,pos+136(sp);
+
+# Careful calling this, it will 'clobber' fpu (by design)
+# Don't call this from C
+FUNC_START(load_fpu)
+	lfd	f14,0(r3)
+	lfd	f15,8(r3)
+	lfd	f16,16(r3)
+	lfd	f17,24(r3)
+	lfd	f18,32(r3)
+	lfd	f19,40(r3)
+	lfd	f20,48(r3)
+	lfd	f21,56(r3)
+	lfd	f22,64(r3)
+	lfd	f23,72(r3)
+	lfd	f24,80(r3)
+	lfd	f25,88(r3)
+	lfd	f26,96(r3)
+	lfd	f27,104(r3)
+	lfd	f28,112(r3)
+	lfd	f29,120(r3)
+	lfd	f30,128(r3)
+	lfd	f31,136(r3)
+	blr
+FUNC_END(load_fpu)
+
+FUNC_START(check_fpu)
+	mr r4,r3
+	li	r3,1 # assume a bad result
+	lfd	f0,0(r4)
+	fcmpu	cr1,f0,f14
+	bne	cr1,1f
+	lfd	f0,8(r4)
+	fcmpu	cr1,f0,f15
+	bne	cr1,1f
+	lfd	f0,16(r4)
+	fcmpu	cr1,f0,f16
+	bne	cr1,1f
+	lfd	f0,24(r4)
+	fcmpu	cr1,f0,f17
+	bne	cr1,1f
+	lfd	f0,32(r4)
+	fcmpu	cr1,f0,f18
+	bne	cr1,1f
+	lfd	f0,40(r4)
+	fcmpu	cr1,f0,f19
+	bne	cr1,1f
+	lfd	f0,48(r4)
+	fcmpu	cr1,f0,f20
+	bne	cr1,1f
+	lfd	f0,56(r4)
+	fcmpu	cr1,f0,f21
+	bne	cr1,1f
+	lfd	f0,64(r4)
+	fcmpu	cr1,f0,f22
+	bne	cr1,1f
+	lfd	f0,72(r4)
+	fcmpu	cr1,f0,f23
+	bne	cr1,1f
+	lfd	f0,80(r4)
+	fcmpu	cr1,f0,f24
+	bne	cr1,1f
+	lfd	f0,88(r4)
+	fcmpu	cr1,f0,f25
+	bne	cr1,1f
+	lfd	f0,96(r4)
+	fcmpu	cr1,f0,f26
+	bne	cr1,1f
+	lfd	f0,104(r4)
+	fcmpu	cr1,f0,f27
+	bne	cr1,1f
+	lfd	f0,112(r4)
+	fcmpu	cr1,f0,f28
+	bne	cr1,1f
+	lfd	f0,120(r4)
+	fcmpu	cr1,f0,f29
+	bne	cr1,1f
+	lfd	f0,128(r4)
+	fcmpu	cr1,f0,f30
+	bne	cr1,1f
+	lfd	f0,136(r4)
+	fcmpu	cr1,f0,f31
+	bne	cr1,1f
+	li	r3,0 # Success!!!
+1:	blr
+
+FUNC_START(test_fpu)
+	# r3 holds pointer to where to put the result of fork
+	# r4 holds pointer to the pid
+	# f14-f31 are non volatiles
+	PUSH_BASIC_STACK(256)
+	std	r3,STACK_FRAME_PARAM(0)(sp) # Address of darray
+	std r4,STACK_FRAME_PARAM(1)(sp) # Address of pid
+	PUSH_FPU(STACK_FRAME_LOCAL(2,0))
+
+	bl load_fpu
+	nop
+	li	r0,__NR_fork
+	sc
+
+	# pass the result of the fork to the caller
+	ld	r9,STACK_FRAME_PARAM(1)(sp)
+	std	r3,0(r9)
+
+	ld r3,STACK_FRAME_PARAM(0)(sp)
+	bl check_fpu
+	nop
+
+	POP_FPU(STACK_FRAME_LOCAL(2,0))
+	POP_BASIC_STACK(256)
+	blr
+FUNC_END(test_fpu)
+
+# int preempt_fpu(double *darray, int *threads_running, int *running)
+# On starting will (atomically) decrement not_ready as a signal that the FPU
+# has been loaded with darray. Will proceed to check the validity of the FPU
+# registers while running is not zero.
+FUNC_START(preempt_fpu)
+	PUSH_BASIC_STACK(256)
+	std r3,STACK_FRAME_PARAM(0)(sp) # double *darray
+	std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting
+	std r5,STACK_FRAME_PARAM(2)(sp) # int *running
+	PUSH_FPU(STACK_FRAME_LOCAL(3,0))
+
+	bl load_fpu
+	nop
+
+	sync
+	# Atomic DEC
+	ld r3,STACK_FRAME_PARAM(1)(sp)
+1:	lwarx r4,0,r3
+	addi r4,r4,-1
+	stwcx. r4,0,r3
+	bne- 1b
+
+2:	ld r3,STACK_FRAME_PARAM(0)(sp)
+	bl check_fpu
+	nop
+	cmpdi r3,0
+	bne 3f
+	ld r4,STACK_FRAME_PARAM(2)(sp)
+	ld r5,0(r4)
+	cmpwi r5,0
+	bne 2b
+
+3:	POP_FPU(STACK_FRAME_LOCAL(3,0))
+	POP_BASIC_STACK(256)
+	blr
+FUNC_END(preempt_fpu)
diff --git a/tools/testing/selftests/powerpc/math/fpu_preempt.c b/tools/testing/selftests/powerpc/math/fpu_preempt.c
new file mode 100644
index 000000000000..0f85b79d883d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_preempt.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the FPU registers change across preemption.
+ * Two things should be noted here a) The check_fpu function in asm only checks
+ * the non volatile registers as it is reused from the syscall test b) There is
+ * no way to be sure preemption happened so this test just uses many threads
+ * and a long wait. As such, a successful test doesn't mean much but a failure
+ * is bad.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+
+__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+		     1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+		     2.1};
+
+int threads_starting;
+int running;
+
+extern void preempt_fpu(double *darray, int *threads_starting, int *running);
+
+void *preempt_fpu_c(void *p)
+{
+	int i;
+	srand(pthread_self());
+	for (i = 0; i < 21; i++)
+		darray[i] = rand();
+
+	/* Test failed if it ever returns */
+	preempt_fpu(darray, &threads_starting, &running);
+
+	return p;
+}
+
+int test_preempt_fpu(void)
+{
+	int i, rc, threads;
+	pthread_t *tids;
+
+	threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+	tids = malloc((threads) * sizeof(pthread_t));
+	FAIL_IF(!tids);
+
+	running = true;
+	threads_starting = threads;
+	for (i = 0; i < threads; i++) {
+		rc = pthread_create(&tids[i], NULL, preempt_fpu_c, NULL);
+		FAIL_IF(rc);
+	}
+
+	setbuf(stdout, NULL);
+	/* Not really necessary but nice to wait for every thread to start */
+	printf("\tWaiting for all workers to start...");
+	while(threads_starting)
+		asm volatile("": : :"memory");
+	printf("done\n");
+
+	printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+	sleep(PREEMPT_TIME);
+	printf("done\n");
+
+	printf("\tStopping workers...");
+	/*
+	 * Working are checking this value every loop. In preempt_fpu 'cmpwi r5,0; bne 2b'.
+	 * r5 will have loaded the value of running.
+	 */
+	running = 0;
+	for (i = 0; i < threads; i++) {
+		void *rc_p;
+		pthread_join(tids[i], &rc_p);
+
+		/*
+		 * Harness will say the fail was here, look at why preempt_fpu
+		 * returned
+		 */
+		if ((long) rc_p)
+			printf("oops\n");
+		FAIL_IF((long) rc_p);
+	}
+	printf("done\n");
+
+	free(tids);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_preempt_fpu, "fpu_preempt");
+}
diff --git a/tools/testing/selftests/powerpc/math/fpu_signal.c b/tools/testing/selftests/powerpc/math/fpu_signal.c
new file mode 100644
index 000000000000..888aa51b4204
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_signal.c
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the FPU registers are correctly reported in a
+ * signal context. Each worker just spins checking its FPU registers, at some
+ * point a signal will interrupt it and C code will check the signal context
+ * ensuring it is also the same.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Number of times each thread should receive the signal */
+#define ITERATIONS 10
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+		     1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+		     2.1};
+
+bool bad_context;
+int threads_starting;
+int running;
+
+extern long preempt_fpu(double *darray, int *threads_starting, int *running);
+
+void signal_fpu_sig(int sig, siginfo_t *info, void *context)
+{
+	int i;
+	ucontext_t *uc = context;
+	mcontext_t *mc = &uc->uc_mcontext;
+
+	/* Only the non volatiles were loaded up */
+	for (i = 14; i < 32; i++) {
+		if (mc->fp_regs[i] != darray[i - 14]) {
+			bad_context = true;
+			break;
+		}
+	}
+}
+
+void *signal_fpu_c(void *p)
+{
+	int i;
+	long rc;
+	struct sigaction act;
+	act.sa_sigaction = signal_fpu_sig;
+	act.sa_flags = SA_SIGINFO;
+	rc = sigaction(SIGUSR1, &act, NULL);
+	if (rc)
+		return p;
+
+	srand(pthread_self());
+	for (i = 0; i < 21; i++)
+		darray[i] = rand();
+
+	rc = preempt_fpu(darray, &threads_starting, &running);
+
+	return (void *) rc;
+}
+
+int test_signal_fpu(void)
+{
+	int i, j, rc, threads;
+	void *rc_p;
+	pthread_t *tids;
+
+	threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+	tids = malloc(threads * sizeof(pthread_t));
+	FAIL_IF(!tids);
+
+	running = true;
+	threads_starting = threads;
+	for (i = 0; i < threads; i++) {
+		rc = pthread_create(&tids[i], NULL, signal_fpu_c, NULL);
+		FAIL_IF(rc);
+	}
+
+	setbuf(stdout, NULL);
+	printf("\tWaiting for all workers to start...");
+	while (threads_starting)
+		asm volatile("": : :"memory");
+	printf("done\n");
+
+	printf("\tSending signals to all threads %d times...", ITERATIONS);
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < threads; j++) {
+			pthread_kill(tids[j], SIGUSR1);
+		}
+		sleep(1);
+	}
+	printf("done\n");
+
+	printf("\tStopping workers...");
+	running = 0;
+	for (i = 0; i < threads; i++) {
+		pthread_join(tids[i], &rc_p);
+
+		/*
+		 * Harness will say the fail was here, look at why signal_fpu
+		 * returned
+		 */
+		if ((long) rc_p || bad_context)
+			printf("oops\n");
+		if (bad_context)
+			fprintf(stderr, "\t!! bad_context is true\n");
+		FAIL_IF((long) rc_p || bad_context);
+	}
+	printf("done\n");
+
+	free(tids);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_signal_fpu, "fpu_signal");
+}
diff --git a/tools/testing/selftests/powerpc/math/fpu_syscall.c b/tools/testing/selftests/powerpc/math/fpu_syscall.c
new file mode 100644
index 000000000000..949e6721256d
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/fpu_syscall.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the FPU registers change across a syscall (fork).
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+
+#include "utils.h"
+
+extern int test_fpu(double *darray, pid_t *pid);
+
+double darray[] = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
+		     1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
+		     2.1};
+
+int syscall_fpu(void)
+{
+	pid_t fork_pid;
+	int i;
+	int ret;
+	int child_ret;
+	for (i = 0; i < 1000; i++) {
+		/* test_fpu will fork() */
+		ret = test_fpu(darray, &fork_pid);
+		if (fork_pid == -1)
+			return -1;
+		if (fork_pid == 0)
+			exit(ret);
+		waitpid(fork_pid, &child_ret, 0);
+		if (ret || child_ret)
+			return 1;
+	}
+
+	return 0;
+}
+
+int test_syscall_fpu(void)
+{
+	/*
+	 * Setup an environment with much context switching
+	 */
+	pid_t pid2;
+	pid_t pid = fork();
+	int ret;
+	int child_ret;
+	FAIL_IF(pid == -1);
+
+	pid2 = fork();
+	/* Can't FAIL_IF(pid2 == -1); because already forked once */
+	if (pid2 == -1) {
+		/*
+		 * Couldn't fork, ensure test is a fail
+		 */
+		child_ret = ret = 1;
+	} else {
+		ret = syscall_fpu();
+		if (pid2)
+			waitpid(pid2, &child_ret, 0);
+		else
+			exit(ret);
+	}
+
+	ret |= child_ret;
+
+	if (pid)
+		waitpid(pid, &child_ret, 0);
+	else
+		exit(ret);
+
+	FAIL_IF(ret || child_ret);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_syscall_fpu, "syscall_fpu");
+
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_asm.S b/tools/testing/selftests/powerpc/math/vmx_asm.S
new file mode 100644
index 000000000000..1b8c248b3ac1
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_asm.S
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "../basic_asm.h"
+
+# POS MUST BE 16 ALIGNED!
+#define PUSH_VMX(pos,reg) \
+	li	reg,pos; \
+	stvx	v20,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v21,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v22,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v23,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v24,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v25,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v26,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v27,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v28,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v29,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v30,reg,sp; \
+	addi	reg,reg,16; \
+	stvx	v31,reg,sp;
+
+# POS MUST BE 16 ALIGNED!
+#define POP_VMX(pos,reg) \
+	li	reg,pos; \
+	lvx	v20,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v21,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v22,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v23,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v24,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v25,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v26,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v27,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v28,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v29,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v30,reg,sp; \
+	addi	reg,reg,16; \
+	lvx	v31,reg,sp;
+
+# Carefull this will 'clobber' vmx (by design)
+# Don't call this from C
+FUNC_START(load_vmx)
+	li	r5,0
+	lvx	v20,r5,r3
+	addi	r5,r5,16
+	lvx	v21,r5,r3
+	addi	r5,r5,16
+	lvx	v22,r5,r3
+	addi	r5,r5,16
+	lvx	v23,r5,r3
+	addi	r5,r5,16
+	lvx	v24,r5,r3
+	addi	r5,r5,16
+	lvx	v25,r5,r3
+	addi	r5,r5,16
+	lvx	v26,r5,r3
+	addi	r5,r5,16
+	lvx	v27,r5,r3
+	addi	r5,r5,16
+	lvx	v28,r5,r3
+	addi	r5,r5,16
+	lvx	v29,r5,r3
+	addi	r5,r5,16
+	lvx	v30,r5,r3
+	addi	r5,r5,16
+	lvx	v31,r5,r3
+	blr
+FUNC_END(load_vmx)
+
+# Should be safe from C, only touches r4, r5 and v0,v1,v2
+FUNC_START(check_vmx)
+	PUSH_BASIC_STACK(32)
+	mr r4,r3
+	li	r3,1 # assume a bad result
+	li	r5,0
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v20
+	vmr	v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v21
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v22
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v23
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v24
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v25
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v26
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v27
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v28
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v29
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v30
+	vand	v2,v2,v1
+
+	addi	r5,r5,16
+	lvx	v0,r5,r4
+	vcmpequd.	v1,v0,v31
+	vand	v2,v2,v1
+
+	li	r5,STACK_FRAME_LOCAL(0,0)
+	stvx	v2,r5,sp
+	ldx	r0,r5,sp
+	cmpdi	r0,0xffffffffffffffff
+	bne	1f
+	li	r3,0
+1:	POP_BASIC_STACK(32)
+	blr
+FUNC_END(check_vmx)
+
+# Safe from C
+FUNC_START(test_vmx)
+	# r3 holds pointer to where to put the result of fork
+	# r4 holds pointer to the pid
+	# v20-v31 are non-volatile
+	PUSH_BASIC_STACK(512)
+	std	r3,STACK_FRAME_PARAM(0)(sp) # Address of varray
+	std r4,STACK_FRAME_PARAM(1)(sp) # address of pid
+	PUSH_VMX(STACK_FRAME_LOCAL(2,0),r4)
+
+	bl load_vmx
+	nop
+
+	li	r0,__NR_fork
+	sc
+	# Pass the result of fork back to the caller
+	ld	r9,STACK_FRAME_PARAM(1)(sp)
+	std	r3,0(r9)
+
+	ld r3,STACK_FRAME_PARAM(0)(sp)
+	bl check_vmx
+	nop
+
+	POP_VMX(STACK_FRAME_LOCAL(2,0),r4)
+	POP_BASIC_STACK(512)
+	blr
+FUNC_END(test_vmx)
+
+# int preempt_vmx(vector int *varray, int *threads_starting, int *running)
+# On starting will (atomically) decrement threads_starting as a signal that
+# the VMX have been loaded with varray. Will proceed to check the validity of
+# the VMX registers while running is not zero.
+FUNC_START(preempt_vmx)
+	PUSH_BASIC_STACK(512)
+	std r3,STACK_FRAME_PARAM(0)(sp) # vector int *varray
+	std r4,STACK_FRAME_PARAM(1)(sp) # int *threads_starting
+	std r5,STACK_FRAME_PARAM(2)(sp) # int *running
+	# VMX need to write to 16 byte aligned addresses, skip STACK_FRAME_LOCAL(3,0)
+	PUSH_VMX(STACK_FRAME_LOCAL(4,0),r4)
+
+	bl load_vmx
+	nop
+
+	sync
+	# Atomic DEC
+	ld r3,STACK_FRAME_PARAM(1)(sp)
+1:	lwarx r4,0,r3
+	addi r4,r4,-1
+	stwcx. r4,0,r3
+	bne- 1b
+
+2:	ld r3,STACK_FRAME_PARAM(0)(sp)
+	bl check_vmx
+	nop
+	cmpdi r3,0
+	bne 3f
+	ld r4,STACK_FRAME_PARAM(2)(sp)
+	ld r5,0(r4)
+	cmpwi r5,0
+	bne 2b
+
+3:	POP_VMX(STACK_FRAME_LOCAL(4,0),r4)
+	POP_BASIC_STACK(512)
+	blr
+FUNC_END(preempt_vmx)
diff --git a/tools/testing/selftests/powerpc/math/vmx_preempt.c b/tools/testing/selftests/powerpc/math/vmx_preempt.c
new file mode 100644
index 000000000000..9ef376c55b13
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_preempt.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the VMX registers change across preemption.
+ * Two things should be noted here a) The check_vmx function in asm only checks
+ * the non volatile registers as it is reused from the syscall test b) There is
+ * no way to be sure preemption happened so this test just uses many threads
+ * and a long wait. As such, a successful test doesn't mean much but a failure
+ * is bad.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+#include "utils.h"
+
+/* Time to wait for workers to get preempted (seconds) */
+#define PREEMPT_TIME 20
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+	{13,14,15,16},{17,18,19,20},{21,22,23,24},
+	{25,26,27,28},{29,30,31,32},{33,34,35,36},
+	{37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+int threads_starting;
+int running;
+
+extern void preempt_vmx(vector int *varray, int *threads_starting, int *running);
+
+void *preempt_vmx_c(void *p)
+{
+	int i, j;
+	srand(pthread_self());
+	for (i = 0; i < 12; i++)
+		for (j = 0; j < 4; j++)
+			varray[i][j] = rand();
+
+	/* Test fails if it ever returns */
+	preempt_vmx(varray, &threads_starting, &running);
+	return p;
+}
+
+int test_preempt_vmx(void)
+{
+	int i, rc, threads;
+	pthread_t *tids;
+
+	threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+	tids = malloc(threads * sizeof(pthread_t));
+	FAIL_IF(!tids);
+
+	running = true;
+	threads_starting = threads;
+	for (i = 0; i < threads; i++) {
+		rc = pthread_create(&tids[i], NULL, preempt_vmx_c, NULL);
+		FAIL_IF(rc);
+	}
+
+	setbuf(stdout, NULL);
+	/* Not really nessesary but nice to wait for every thread to start */
+	printf("\tWaiting for all workers to start...");
+	while(threads_starting)
+		asm volatile("": : :"memory");
+	printf("done\n");
+
+	printf("\tWaiting for %d seconds to let some workers get preempted...", PREEMPT_TIME);
+	sleep(PREEMPT_TIME);
+	printf("done\n");
+
+	printf("\tStopping workers...");
+	/*
+	 * Working are checking this value every loop. In preempt_vmx 'cmpwi r5,0; bne 2b'.
+	 * r5 will have loaded the value of running.
+	 */
+	running = 0;
+	for (i = 0; i < threads; i++) {
+		void *rc_p;
+		pthread_join(tids[i], &rc_p);
+
+		/*
+		 * Harness will say the fail was here, look at why preempt_vmx
+		 * returned
+		 */
+		if ((long) rc_p)
+			printf("oops\n");
+		FAIL_IF((long) rc_p);
+	}
+	printf("done\n");
+
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_preempt_vmx, "vmx_preempt");
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_signal.c b/tools/testing/selftests/powerpc/math/vmx_signal.c
new file mode 100644
index 000000000000..671d7533a557
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_signal.c
@@ -0,0 +1,156 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the VMX registers are correctly reported in a
+ * signal context. Each worker just spins checking its VMX registers, at some
+ * point a signal will interrupt it and C code will check the signal context
+ * ensuring it is also the same.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <altivec.h>
+
+#include "utils.h"
+
+/* Number of times each thread should receive the signal */
+#define ITERATIONS 10
+/*
+ * Factor by which to multiply number of online CPUs for total number of
+ * worker threads
+ */
+#define THREAD_FACTOR 8
+
+__thread vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+	{13,14,15,16},{17,18,19,20},{21,22,23,24},
+	{25,26,27,28},{29,30,31,32},{33,34,35,36},
+	{37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+bool bad_context;
+int running;
+int threads_starting;
+
+extern int preempt_vmx(vector int *varray, int *threads_starting, int *sentinal);
+
+void signal_vmx_sig(int sig, siginfo_t *info, void *context)
+{
+	int i;
+	ucontext_t *uc = context;
+	mcontext_t *mc = &uc->uc_mcontext;
+
+	/* Only the non volatiles were loaded up */
+	for (i = 20; i < 32; i++) {
+		if (memcmp(mc->v_regs->vrregs[i], &varray[i - 20], 16)) {
+			int j;
+			/*
+			 * Shouldn't printf() in a signal handler, however, this is a
+			 * test and we've detected failure. Understanding what failed
+			 * is paramount. All that happens after this is tests exit with
+			 * failure.
+			 */
+			printf("VMX mismatch at reg %d!\n", i);
+			printf("Reg | Actual                  | Expected\n");
+			for (j = 20; j < 32; j++) {
+				printf("%d  | 0x%04x%04x%04x%04x      | 0x%04x%04x%04x%04x\n", j, mc->v_regs->vrregs[j][0],
+					   mc->v_regs->vrregs[j][1], mc->v_regs->vrregs[j][2], mc->v_regs->vrregs[j][3],
+					   varray[j - 20][0], varray[j - 20][1], varray[j - 20][2], varray[j - 20][3]);
+			}
+			bad_context = true;
+			break;
+		}
+	}
+}
+
+void *signal_vmx_c(void *p)
+{
+	int i, j;
+	long rc;
+	struct sigaction act;
+	act.sa_sigaction = signal_vmx_sig;
+	act.sa_flags = SA_SIGINFO;
+	rc = sigaction(SIGUSR1, &act, NULL);
+	if (rc)
+		return p;
+
+	srand(pthread_self());
+	for (i = 0; i < 12; i++)
+		for (j = 0; j < 4; j++)
+			varray[i][j] = rand();
+
+	rc = preempt_vmx(varray, &threads_starting, &running);
+
+	return (void *) rc;
+}
+
+int test_signal_vmx(void)
+{
+	int i, j, rc, threads;
+	void *rc_p;
+	pthread_t *tids;
+
+	threads = sysconf(_SC_NPROCESSORS_ONLN) * THREAD_FACTOR;
+	tids = malloc(threads * sizeof(pthread_t));
+	FAIL_IF(!tids);
+
+	running = true;
+	threads_starting = threads;
+	for (i = 0; i < threads; i++) {
+		rc = pthread_create(&tids[i], NULL, signal_vmx_c, NULL);
+		FAIL_IF(rc);
+	}
+
+	setbuf(stdout, NULL);
+	printf("\tWaiting for %d workers to start... %d", threads, threads_starting);
+	while (threads_starting) {
+		asm volatile("": : :"memory");
+		usleep(1000);
+		printf(", %d", threads_starting);
+	}
+	printf(" ...done\n");
+
+	printf("\tSending signals to all threads %d times...", ITERATIONS);
+	for (i = 0; i < ITERATIONS; i++) {
+		for (j = 0; j < threads; j++) {
+			pthread_kill(tids[j], SIGUSR1);
+		}
+		sleep(1);
+	}
+	printf("done\n");
+
+	printf("\tKilling workers...");
+	running = 0;
+	for (i = 0; i < threads; i++) {
+		pthread_join(tids[i], &rc_p);
+
+		/*
+		 * Harness will say the fail was here, look at why signal_vmx
+		 * returned
+		 */
+		if ((long) rc_p || bad_context)
+			printf("oops\n");
+		if (bad_context)
+			fprintf(stderr, "\t!! bad_context is true\n");
+		FAIL_IF((long) rc_p || bad_context);
+	}
+	printf("done\n");
+
+	free(tids);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_signal_vmx, "vmx_signal");
+}
diff --git a/tools/testing/selftests/powerpc/math/vmx_syscall.c b/tools/testing/selftests/powerpc/math/vmx_syscall.c
new file mode 100644
index 000000000000..a017918ee1ca
--- /dev/null
+++ b/tools/testing/selftests/powerpc/math/vmx_syscall.c
@@ -0,0 +1,91 @@
+/*
+ * Copyright 2015, Cyril Bur, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This test attempts to see if the VMX registers change across a syscall (fork).
+ */
+
+#include <altivec.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include "utils.h"
+
+vector int varray[] = {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10,11,12},
+	{13,14,15,16},{17,18,19,20},{21,22,23,24},
+	{25,26,27,28},{29,30,31,32},{33,34,35,36},
+	{37,38,39,40},{41,42,43,44},{45,46,47,48}};
+
+extern int test_vmx(vector int *varray, pid_t *pid);
+
+int vmx_syscall(void)
+{
+	pid_t fork_pid;
+	int i;
+	int ret;
+	int child_ret;
+	for (i = 0; i < 1000; i++) {
+		/* test_vmx will fork() */
+		ret = test_vmx(varray, &fork_pid);
+		if (fork_pid == -1)
+			return -1;
+		if (fork_pid == 0)
+			exit(ret);
+		waitpid(fork_pid, &child_ret, 0);
+		if (ret || child_ret)
+			return 1;
+	}
+
+	return 0;
+}
+
+int test_vmx_syscall(void)
+{
+	/*
+	 * Setup an environment with much context switching
+	 */
+	pid_t pid2;
+	pid_t pid = fork();
+	int ret;
+	int child_ret;
+	FAIL_IF(pid == -1);
+
+	pid2 = fork();
+	ret = vmx_syscall();
+	/* Can't FAIL_IF(pid2 == -1); because we've already forked */
+	if (pid2 == -1) {
+		/*
+		 * Couldn't fork, ensure child_ret is set and is a fail
+		 */
+		ret = child_ret = 1;
+	} else {
+		if (pid2)
+			waitpid(pid2, &child_ret, 0);
+		else
+			exit(ret);
+	}
+
+	ret |= child_ret;
+
+	if (pid)
+		waitpid(pid, &child_ret, 0);
+	else
+		exit(ret);
+
+	FAIL_IF(ret || child_ret);
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	return test_harness(test_vmx_syscall, "vmx_syscall");
+
+}
diff --git a/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c b/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
index d86653f282b1..8c54d18b3e9a 100644
--- a/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
+++ b/tools/testing/selftests/powerpc/tm/tm-signal-msr-resv.c
@@ -40,7 +40,7 @@ void signal_usr1(int signum, siginfo_t *info, void *uc)
 #ifdef __powerpc64__
 	ucp->uc_mcontext.gp_regs[PT_MSR] |= (7ULL << 32);
 #else
-	ucp->uc_mcontext.regs->gpr[PT_MSR] |= (7ULL);
+	ucp->uc_mcontext.uc_regs->gregs[PT_MSR] |= (7ULL);
 #endif
 	/* Should segv on return becuase of invalid context */
 	segv_expected = 1;