From c02607aad2f9ed478eb288bcec1c00cd9df38b3c Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 29 Sep 2014 10:05:06 -0600
Subject: iommu: introduce domain attribute for nesting IOMMUs

Some IOMMUs, such as the ARM SMMU, support two stages of translation.
The idea behind such a scheme is to allow a guest operating system to
use the IOMMU for DMA mappings in the first stage of translation, with
the hypervisor then installing mappings in the second stage to provide
isolation of the DMA to the physical range assigned to that virtual
machine.

In order to allow IOMMU domains to be used for second-stage translation,
this patch adds a new iommu_attr (IOMMU_ATTR_NESTING) for setting
second-stage domains prior to device attach. The attribute can also be
queried to see if a domain is actually making use of nesting.

Acked-by: Joerg Roedel <jroedel@suse.de>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 include/linux/iommu.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 20f9a527922a..7b02bcc85b9e 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -80,6 +80,7 @@ enum iommu_attr {
 	DOMAIN_ATTR_FSL_PAMU_STASH,
 	DOMAIN_ATTR_FSL_PAMU_ENABLE,
 	DOMAIN_ATTR_FSL_PAMUV1,
+	DOMAIN_ATTR_NESTING,	/* two stages of translation */
 	DOMAIN_ATTR_MAX,
 };
 
-- 
cgit v1.2.3-59-g8ed1b


From f5c9ecebaf2a2c9381973798e389cc019dd983e0 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Mon, 29 Sep 2014 10:06:19 -0600
Subject: vfio/iommu_type1: add new VFIO_TYPE1_NESTING_IOMMU IOMMU type

VFIO allows devices to be safely handed off to userspace by putting
them behind an IOMMU configured to ensure DMA and interrupt isolation.
This enables userspace KVM clients, such as kvmtool and qemu, to further
map the device into a virtual machine.

With IOMMUs such as the ARM SMMU, it is then possible to provide SMMU
translation services to the guest operating system, which are nested
with the existing translation installed by VFIO. However, enabling this
feature means that the IOMMU driver must be informed that the VFIO domain
is being created for the purposes of nested translation.

This patch adds a new IOMMU type (VFIO_TYPE1_NESTING_IOMMU) to the VFIO
type-1 driver. The new IOMMU type acts identically to the
VFIO_TYPE1v2_IOMMU type, but additionally sets the DOMAIN_ATTR_NESTING
attribute on its IOMMU domains.

Cc: Joerg Roedel <joro@8bytes.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_iommu_type1.c | 30 +++++++++++++++++++++++++-----
 include/uapi/linux/vfio.h       |  3 +++
 2 files changed, 28 insertions(+), 5 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0734fbe5b651..583ccdb2c58f 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -57,7 +57,8 @@ struct vfio_iommu {
 	struct list_head	domain_list;
 	struct mutex		lock;
 	struct rb_root		dma_list;
-	bool v2;
+	bool			v2;
+	bool			nesting;
 };
 
 struct vfio_domain {
@@ -705,6 +706,15 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 		goto out_free;
 	}
 
+	if (iommu->nesting) {
+		int attr = 1;
+
+		ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
+					    &attr);
+		if (ret)
+			goto out_domain;
+	}
+
 	ret = iommu_attach_group(domain->domain, iommu_group);
 	if (ret)
 		goto out_domain;
@@ -819,17 +829,26 @@ static void *vfio_iommu_type1_open(unsigned long arg)
 {
 	struct vfio_iommu *iommu;
 
-	if (arg != VFIO_TYPE1_IOMMU && arg != VFIO_TYPE1v2_IOMMU)
-		return ERR_PTR(-EINVAL);
-
 	iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
 	if (!iommu)
 		return ERR_PTR(-ENOMEM);
 
+	switch (arg) {
+	case VFIO_TYPE1_IOMMU:
+		break;
+	case VFIO_TYPE1_NESTING_IOMMU:
+		iommu->nesting = true;
+	case VFIO_TYPE1v2_IOMMU:
+		iommu->v2 = true;
+		break;
+	default:
+		kfree(iommu);
+		return ERR_PTR(-EINVAL);
+	}
+
 	INIT_LIST_HEAD(&iommu->domain_list);
 	iommu->dma_list = RB_ROOT;
 	mutex_init(&iommu->lock);
-	iommu->v2 = (arg == VFIO_TYPE1v2_IOMMU);
 
 	return iommu;
 }
@@ -885,6 +904,7 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
 		switch (arg) {
 		case VFIO_TYPE1_IOMMU:
 		case VFIO_TYPE1v2_IOMMU:
+		case VFIO_TYPE1_NESTING_IOMMU:
 			return 1;
 		case VFIO_DMA_CC_IOMMU:
 			if (!iommu)
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 6612974c64bf..29715d27548f 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -33,6 +33,9 @@
 /* Check if EEH is supported */
 #define VFIO_EEH			5
 
+/* Two-stage IOMMU */
+#define VFIO_TYPE1_NESTING_IOMMU	6	/* Implies v2 */
+
 /*
  * The IOCTL interface is designed for extensibility by embedding the
  * structure length (argsz) and flags into structures passed between
-- 
cgit v1.2.3-59-g8ed1b


From 3b307ffe3faad60aeda0e9a4f661d5c1edbd761e Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Mon, 29 Sep 2014 10:13:46 -0600
Subject: PCI: Export MSI message relevant functions

The patch exports 2 MSI message relevant functions, which will be
used by VFIO PCI driver. The VFIO PCI driver would be built as
a module.

Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Acked-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/pci/msi.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index 5a40516444f3..e468d65eb238 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -312,6 +312,7 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
 
 	__get_cached_msi_msg(entry, msg);
 }
+EXPORT_SYMBOL_GPL(get_cached_msi_msg);
 
 void __write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
 {
@@ -356,6 +357,7 @@ void write_msi_msg(unsigned int irq, struct msi_msg *msg)
 
 	__write_msi_msg(entry, msg);
 }
+EXPORT_SYMBOL_GPL(write_msi_msg);
 
 static void free_msi_irqs(struct pci_dev *dev)
 {
-- 
cgit v1.2.3-59-g8ed1b


From b8f02af096b1fc9fd46680cbe55214e477eb76d3 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Mon, 29 Sep 2014 10:16:24 -0600
Subject: vfio/pci: Restore MSIx message prior to enabling

The MSIx vector table lives in device memory, which may be cleared as
part of a backdoor device reset. This is the case on the IBM IPR HBA
when the BIST is run on the device. When assigned to a QEMU guest,
the guest driver does a pci_save_state(), issues a BIST, then does a
pci_restore_state(). The BIST clears the MSIx vector table, but due
to the way interrupts are configured the pci_restore_state() does not
restore the vector table as expected. Eventually this results in an
EEH error on Power platforms when the device attempts to signal an
interrupt with the zero'd table entry.

Fix the problem by restoring the host cached MSI message prior to
enabling each vector.

Reported-by: Wen Xiong <wenxiong@linux.vnet.ibm.com>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/pci/vfio_pci_intrs.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c
index 9dd49c9839ac..553212f037c3 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -16,6 +16,7 @@
 #include <linux/device.h>
 #include <linux/interrupt.h>
 #include <linux/eventfd.h>
+#include <linux/msi.h>
 #include <linux/pci.h>
 #include <linux/file.h>
 #include <linux/poll.h>
@@ -548,6 +549,20 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
 		return PTR_ERR(trigger);
 	}
 
+	/*
+	 * The MSIx vector table resides in device memory which may be cleared
+	 * via backdoor resets. We don't allow direct access to the vector
+	 * table so even if a userspace driver attempts to save/restore around
+	 * such a reset it would be unsuccessful. To avoid this, restore the
+	 * cached value of the message prior to enabling.
+	 */
+	if (msix) {
+		struct msi_msg msg;
+
+		get_cached_msi_msg(irq, &msg);
+		write_msi_msg(irq, &msg);
+	}
+
 	ret = request_irq(irq, vfio_msihandler, 0,
 			  vdev->ctx[vector].name, trigger);
 	if (ret) {
-- 
cgit v1.2.3-59-g8ed1b


From 0f905ce2b59c666ad48f240bfa2ab28b77f7f936 Mon Sep 17 00:00:00 2001
From: Gavin Shan <gwshan@linux.vnet.ibm.com>
Date: Mon, 29 Sep 2014 10:31:51 -0600
Subject: drivers/vfio: Export vfio_spapr_iommu_eeh_ioctl() with GPL

The function should have been exported with EXPORT_SYMBOL_GPL()
as part of commit 92d18a6851fb ("drivers/vfio: Fix EEH build error").

Suggested-by: Alexey Kardashevskiy <aik@ozlabs.ru>
Signed-off-by: Gavin Shan <gwshan@linux.vnet.ibm.com>
Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
---
 drivers/vfio/vfio_spapr_eeh.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_spapr_eeh.c b/drivers/vfio/vfio_spapr_eeh.c
index 86dfceb9201f..5fa42db769ee 100644
--- a/drivers/vfio/vfio_spapr_eeh.c
+++ b/drivers/vfio/vfio_spapr_eeh.c
@@ -92,7 +92,7 @@ long vfio_spapr_iommu_eeh_ioctl(struct iommu_group *group,
 
 	return ret;
 }
-EXPORT_SYMBOL(vfio_spapr_iommu_eeh_ioctl);
+EXPORT_SYMBOL_GPL(vfio_spapr_iommu_eeh_ioctl);
 
 MODULE_VERSION(DRIVER_VERSION);
 MODULE_LICENSE("GPL v2");
-- 
cgit v1.2.3-59-g8ed1b


From 93899a679fd6b2534b5c297d9316bae039ebcbe1 Mon Sep 17 00:00:00 2001
From: Alex Williamson <alex.williamson@redhat.com>
Date: Mon, 29 Sep 2014 17:18:39 -0600
Subject: vfio-pci: Fix remove path locking

Locking both the remove() and release() path results in a deadlock
that should have been obvious.  To fix this we can get and hold the
vfio_device reference as we evaluate whether to do a bus/slot reset.
This will automatically block any remove() calls, allowing us to
remove the explict lock.  Fixes 61d792562b53.

Signed-off-by: Alex Williamson <alex.williamson@redhat.com>
Cc: stable@vger.kernel.org	[3.17]
---
 drivers/vfio/pci/vfio_pci.c | 136 +++++++++++++++++++-------------------------
 1 file changed, 57 insertions(+), 79 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index f7825332a325..9558da3f06a0 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -876,15 +876,11 @@ static void vfio_pci_remove(struct pci_dev *pdev)
 {
 	struct vfio_pci_device *vdev;
 
-	mutex_lock(&driver_lock);
-
 	vdev = vfio_del_group_dev(&pdev->dev);
 	if (vdev) {
 		iommu_group_put(pdev->dev.iommu_group);
 		kfree(vdev);
 	}
-
-	mutex_unlock(&driver_lock);
 }
 
 static pci_ers_result_t vfio_pci_aer_err_detected(struct pci_dev *pdev,
@@ -927,108 +923,90 @@ static struct pci_driver vfio_pci_driver = {
 	.err_handler	= &vfio_err_handlers,
 };
 
-/*
- * Test whether a reset is necessary and possible.  We mark devices as
- * needs_reset when they are released, but don't have a function-local reset
- * available.  If any of these exist in the affected devices, we want to do
- * a bus/slot reset.  We also need all of the affected devices to be unused,
- * so we abort if any device has a non-zero refcnt.  driver_lock prevents a
- * device from being opened during the scan or unbound from vfio-pci.
- */
-static int vfio_pci_test_bus_reset(struct pci_dev *pdev, void *data)
-{
-	bool *needs_reset = data;
-	struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
-	int ret = -EBUSY;
-
-	if (pci_drv == &vfio_pci_driver) {
-		struct vfio_device *device;
-		struct vfio_pci_device *vdev;
-
-		device = vfio_device_get_from_dev(&pdev->dev);
-		if (!device)
-			return ret;
-
-		vdev = vfio_device_data(device);
-		if (vdev) {
-			if (vdev->needs_reset)
-				*needs_reset = true;
-
-			if (!vdev->refcnt)
-				ret = 0;
-		}
-
-		vfio_device_put(device);
-	}
-
-	/*
-	 * TODO: vfio-core considers groups to be viable even if some devices
-	 * are attached to known drivers, like pci-stub or pcieport.  We can't
-	 * freeze devices from being unbound to those drivers like we can
-	 * here though, so it would be racy to test for them.  We also can't
-	 * use device_lock() to prevent changes as that would interfere with
-	 * PCI-core taking device_lock during bus reset.  For now, we require
-	 * devices to be bound to vfio-pci to get a bus/slot reset on release.
-	 */
-
-	return ret;
-}
+struct vfio_devices {
+	struct vfio_device **devices;
+	int cur_index;
+	int max_index;
+};
 
-/* Clear needs_reset on all affected devices after successful bus/slot reset */
-static int vfio_pci_clear_needs_reset(struct pci_dev *pdev, void *data)
+static int vfio_pci_get_devs(struct pci_dev *pdev, void *data)
 {
+	struct vfio_devices *devs = data;
 	struct pci_driver *pci_drv = ACCESS_ONCE(pdev->driver);
 
-	if (pci_drv == &vfio_pci_driver) {
-		struct vfio_device *device;
-		struct vfio_pci_device *vdev;
+	if (pci_drv != &vfio_pci_driver)
+		return -EBUSY;
 
-		device = vfio_device_get_from_dev(&pdev->dev);
-		if (!device)
-			return 0;
+	if (devs->cur_index == devs->max_index)
+		return -ENOSPC;
 
-		vdev = vfio_device_data(device);
-		if (vdev)
-			vdev->needs_reset = false;
-
-		vfio_device_put(device);
-	}
+	devs->devices[devs->cur_index] = vfio_device_get_from_dev(&pdev->dev);
+	if (!devs->devices[devs->cur_index])
+		return -EINVAL;
 
+	devs->cur_index++;
 	return 0;
 }
 
 /*
  * Attempt to do a bus/slot reset if there are devices affected by a reset for
  * this device that are needs_reset and all of the affected devices are unused
- * (!refcnt).  Callers of this function are required to hold driver_lock such
- * that devices can not be unbound from vfio-pci or opened by a user while we
- * test for and perform a bus/slot reset.
+ * (!refcnt).  Callers are required to hold driver_lock when calling this to
+ * prevent device opens and concurrent bus reset attempts.  We prevent device
+ * unbinds by acquiring and holding a reference to the vfio_device.
+ *
+ * NB: vfio-core considers a group to be viable even if some devices are
+ * bound to drivers like pci-stub or pcieport.  Here we require all devices
+ * to be bound to vfio_pci since that's the only way we can be sure they
+ * stay put.
  */
 static void vfio_pci_try_bus_reset(struct vfio_pci_device *vdev)
 {
+	struct vfio_devices devs = { .cur_index = 0 };
+	int i = 0, ret = -EINVAL;
 	bool needs_reset = false, slot = false;
-	int ret;
+	struct vfio_pci_device *tmp;
 
 	if (!pci_probe_reset_slot(vdev->pdev->slot))
 		slot = true;
 	else if (pci_probe_reset_bus(vdev->pdev->bus))
 		return;
 
-	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
-					  vfio_pci_test_bus_reset,
-					  &needs_reset, slot) || !needs_reset)
+	if (vfio_pci_for_each_slot_or_bus(vdev->pdev, vfio_pci_count_devs,
+					  &i, slot) || !i)
 		return;
 
-	if (slot)
-		ret = pci_try_reset_slot(vdev->pdev->slot);
-	else
-		ret = pci_try_reset_bus(vdev->pdev->bus);
-
-	if (ret)
+	devs.max_index = i;
+	devs.devices = kcalloc(i, sizeof(struct vfio_device *), GFP_KERNEL);
+	if (!devs.devices)
 		return;
 
-	vfio_pci_for_each_slot_or_bus(vdev->pdev,
-				      vfio_pci_clear_needs_reset, NULL, slot);
+	if (vfio_pci_for_each_slot_or_bus(vdev->pdev,
+					  vfio_pci_get_devs, &devs, slot))
+		goto put_devs;
+
+	for (i = 0; i < devs.cur_index; i++) {
+		tmp = vfio_device_data(devs.devices[i]);
+		if (tmp->needs_reset)
+			needs_reset = true;
+		if (tmp->refcnt)
+			goto put_devs;
+	}
+
+	if (needs_reset)
+		ret = slot ? pci_try_reset_slot(vdev->pdev->slot) :
+			     pci_try_reset_bus(vdev->pdev->bus);
+
+put_devs:
+	for (i = 0; i < devs.cur_index; i++) {
+		if (!ret) {
+			tmp = vfio_device_data(devs.devices[i]);
+			tmp->needs_reset = false;
+		}
+		vfio_device_put(devs.devices[i]);
+	}
+
+	kfree(devs.devices);
 }
 
 static void __exit vfio_pci_cleanup(void)
-- 
cgit v1.2.3-59-g8ed1b