aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/arch/powerpc/platforms/powernv
diff options
context:
space:
mode:
Diffstat (limited to 'arch/powerpc/platforms/powernv')
-rw-r--r--arch/powerpc/platforms/powernv/Makefile2
-rw-r--r--arch/powerpc/platforms/powernv/idle.c2
-rw-r--r--arch/powerpc/platforms/powernv/npu-dma.c117
-rw-r--r--arch/powerpc/platforms/powernv/opal-fadump.c2
-rw-r--r--arch/powerpc/platforms/powernv/opal.c4
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda-tce.c28
-rw-r--r--arch/powerpc/platforms/powernv/pci-ioda.c299
-rw-r--r--arch/powerpc/platforms/powernv/pci.c20
-rw-r--r--arch/powerpc/platforms/powernv/pci.h28
-rw-r--r--arch/powerpc/platforms/powernv/smp.c1
-rw-r--r--arch/powerpc/platforms/powernv/vas-api.c278
-rw-r--r--arch/powerpc/platforms/powernv/vas-debug.c2
-rw-r--r--arch/powerpc/platforms/powernv/vas-fault.c382
-rw-r--r--arch/powerpc/platforms/powernv/vas-window.c238
-rw-r--r--arch/powerpc/platforms/powernv/vas.c85
-rw-r--r--arch/powerpc/platforms/powernv/vas.h59
16 files changed, 1230 insertions, 317 deletions
diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile
index c0f8120045c3..fe3f0fb5aeca 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -17,7 +17,7 @@ obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o
obj-$(CONFIG_OPAL_PRD) += opal-prd.o
obj-$(CONFIG_PERF_EVENTS) += opal-imc.o
obj-$(CONFIG_PPC_MEMTRACE) += memtrace.o
-obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o
+obj-$(CONFIG_PPC_VAS) += vas.o vas-window.o vas-debug.o vas-fault.o vas-api.o
obj-$(CONFIG_OCXL_BASE) += ocxl.o
obj-$(CONFIG_SCOM_DEBUGFS) += opal-xscom.o
obj-$(CONFIG_PPC_SECURE_BOOT) += opal-secvar.o
diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 78599bca66c2..2dd467383a88 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -1270,7 +1270,7 @@ static int pnv_parse_cpuidle_dt(void)
/* Read residencies */
if (of_property_read_u32_array(np, "ibm,cpu-idle-state-residency-ns",
temp_u32, nr_idle_states)) {
- pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-latencies-ns in DT\n");
+ pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n");
rc = -EINVAL;
goto out;
}
diff --git a/arch/powerpc/platforms/powernv/npu-dma.c b/arch/powerpc/platforms/powernv/npu-dma.c
index b95b9e3c4c98..abeaa533b976 100644
--- a/arch/powerpc/platforms/powernv/npu-dma.c
+++ b/arch/powerpc/platforms/powernv/npu-dma.c
@@ -15,6 +15,7 @@
#include <asm/debugfs.h>
#include <asm/powernv.h>
+#include <asm/ppc-pci.h>
#include <asm/opal.h>
#include "pci.h"
@@ -425,9 +426,10 @@ static void pnv_comp_attach_table_group(struct npu_comp *npucomp,
++npucomp->pe_num;
}
-struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
+static struct iommu_table_group *
+ pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
{
- struct iommu_table_group *table_group;
+ struct iommu_table_group *compound_group;
struct npu_comp *npucomp;
struct pci_dev *gpdev = NULL;
struct pci_controller *hose;
@@ -446,39 +448,52 @@ struct iommu_table_group *pnv_try_setup_npu_table_group(struct pnv_ioda_pe *pe)
hose = pci_bus_to_host(npdev->bus);
if (hose->npu) {
- table_group = &hose->npu->npucomp.table_group;
-
- if (!table_group->group) {
- table_group->ops = &pnv_npu_peers_ops;
- iommu_register_group(table_group,
- hose->global_number,
- pe->pe_number);
- }
+ /* P9 case: compound group is per-NPU (all gpus, all links) */
+ npucomp = &hose->npu->npucomp;
} else {
- /* Create a group for 1 GPU and attached NPUs for POWER8 */
- pe->npucomp = kzalloc(sizeof(*pe->npucomp), GFP_KERNEL);
- table_group = &pe->npucomp->table_group;
- table_group->ops = &pnv_npu_peers_ops;
- iommu_register_group(table_group, hose->global_number,
- pe->pe_number);
+ /* P8 case: Compound group is per-GPU (1 gpu, 2 links) */
+ npucomp = pe->npucomp = kzalloc(sizeof(*npucomp), GFP_KERNEL);
}
- /* Steal capabilities from a GPU PE */
- table_group->max_dynamic_windows_supported =
- pe->table_group.max_dynamic_windows_supported;
- table_group->tce32_start = pe->table_group.tce32_start;
- table_group->tce32_size = pe->table_group.tce32_size;
- table_group->max_levels = pe->table_group.max_levels;
- if (!table_group->pgsizes)
- table_group->pgsizes = pe->table_group.pgsizes;
+ compound_group = &npucomp->table_group;
+ if (!compound_group->group) {
+ compound_group->ops = &pnv_npu_peers_ops;
+ iommu_register_group(compound_group, hose->global_number,
+ pe->pe_number);
- npucomp = container_of(table_group, struct npu_comp, table_group);
+ /* Steal capabilities from a GPU PE */
+ compound_group->max_dynamic_windows_supported =
+ pe->table_group.max_dynamic_windows_supported;
+ compound_group->tce32_start = pe->table_group.tce32_start;
+ compound_group->tce32_size = pe->table_group.tce32_size;
+ compound_group->max_levels = pe->table_group.max_levels;
+ if (!compound_group->pgsizes)
+ compound_group->pgsizes = pe->table_group.pgsizes;
+ }
+
+ /*
+ * The gpu would have been added to the iommu group that's created
+ * for the PE. Pull it out now.
+ */
+ iommu_del_device(&gpdev->dev);
+
+ /*
+ * I'm not sure this is strictly required, but it's probably a good idea
+ * since the table_group for the PE is going to be attached to the
+ * compound table group. If we leave the PE's iommu group active then
+ * we might have the same table_group being modifiable via two sepeate
+ * iommu groups.
+ */
+ iommu_group_put(pe->table_group.group);
+
+ /* now put the GPU into the compound group */
pnv_comp_attach_table_group(npucomp, pe);
+ iommu_add_device(compound_group, &gpdev->dev);
- return table_group;
+ return compound_group;
}
-struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
+static struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
{
struct iommu_table_group *table_group;
struct npu_comp *npucomp;
@@ -521,6 +536,54 @@ struct iommu_table_group *pnv_npu_compound_attach(struct pnv_ioda_pe *pe)
return table_group;
}
+
+void pnv_pci_npu_setup_iommu_groups(void)
+{
+ struct pci_controller *hose;
+ struct pnv_phb *phb;
+ struct pnv_ioda_pe *pe;
+
+ /*
+ * For non-nvlink devices the IOMMU group is registered when the PE is
+ * configured and devices are added to the group when the per-device
+ * DMA setup is run. That's done in hose->ops.dma_dev_setup() which is
+ * only initialise for "normal" IODA PHBs.
+ *
+ * For NVLink devices we need to ensure the NVLinks and the GPU end up
+ * in the same IOMMU group, so that's handled here.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ phb = hose->private_data;
+
+ if (phb->type == PNV_PHB_IODA2)
+ list_for_each_entry(pe, &phb->ioda.pe_list, list)
+ pnv_try_setup_npu_table_group(pe);
+ }
+
+ /*
+ * Now we have all PHBs discovered, time to add NPU devices to
+ * the corresponding IOMMU groups.
+ */
+ list_for_each_entry(hose, &hose_list, list_node) {
+ unsigned long pgsizes;
+
+ phb = hose->private_data;
+
+ if (phb->type != PNV_PHB_NPU_NVLINK)
+ continue;
+
+ pgsizes = pnv_ioda_parse_tce_sizes(phb);
+ list_for_each_entry(pe, &phb->ioda.pe_list, list) {
+ /*
+ * IODA2 bridges get this set up from
+ * pci_controller_ops::setup_bridge but NPU bridges
+ * do not have this hook defined so we do it here.
+ */
+ pe->table_group.pgsizes = pgsizes;
+ pnv_npu_compound_attach(pe);
+ }
+ }
+}
#endif /* CONFIG_IOMMU_API */
int pnv_npu2_init(struct pci_controller *hose)
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index d361d37d975f..9a360ced663b 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -671,7 +671,7 @@ void __init opal_fadump_dt_scan(struct fw_dump *fadump_conf, u64 node)
* Firmware supports 32-bit field for size. Align it to PAGE_SIZE
* and request firmware to copy multiple kernel boot memory regions.
*/
- fadump_conf->max_copy_size = _ALIGN_DOWN(U32_MAX, PAGE_SIZE);
+ fadump_conf->max_copy_size = ALIGN_DOWN(U32_MAX, PAGE_SIZE);
/*
* Check if dump has been initiated on last reboot.
diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c
index 2b3dfd0b6cdd..d95954ad4c0a 100644
--- a/arch/powerpc/platforms/powernv/opal.c
+++ b/arch/powerpc/platforms/powernv/opal.c
@@ -811,6 +811,10 @@ static int opal_add_one_export(struct kobject *parent, const char *export_name,
goto out;
attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ if (!attr) {
+ rc = -ENOMEM;
+ goto out;
+ }
name = kstrdup(export_name, GFP_KERNEL);
if (!name) {
rc = -ENOMEM;
diff --git a/arch/powerpc/platforms/powernv/pci-ioda-tce.c b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
index 5dc6847d5f4c..f923359d8afc 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda-tce.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda-tce.c
@@ -17,6 +17,34 @@
#include <asm/tce.h>
#include "pci.h"
+unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
+{
+ struct pci_controller *hose = phb->hose;
+ struct device_node *dn = hose->dn;
+ unsigned long mask = 0;
+ int i, rc, count;
+ u32 val;
+
+ count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
+ if (count <= 0) {
+ mask = SZ_4K | SZ_64K;
+ /* Add 16M for POWER8 by default */
+ if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+ !cpu_has_feature(CPU_FTR_ARCH_300))
+ mask |= SZ_16M | SZ_256M;
+ return mask;
+ }
+
+ for (i = 0; i < count; i++) {
+ rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
+ i, &val);
+ if (rc == 0)
+ mask |= 1ULL << val;
+ }
+
+ return mask;
+}
+
void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift)
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 57d3a6af1d52..73a63efcf855 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -51,6 +51,7 @@ static const char * const pnv_phb_names[] = { "IODA1", "IODA2", "NPU_NVLINK",
"NPU_OCAPI" };
static void pnv_pci_ioda2_set_bypass(struct pnv_ioda_pe *pe, bool enable);
+static void pnv_pci_configure_bus(struct pci_bus *bus);
void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
const char *fmt, ...)
@@ -264,8 +265,8 @@ static void pnv_ioda_reserve_dev_m64_pe(struct pci_dev *pdev,
if (!r->parent || !pnv_pci_is_m64(phb, r))
continue;
- start = _ALIGN_DOWN(r->start - base, sgsz);
- end = _ALIGN_UP(r->end - base, sgsz);
+ start = ALIGN_DOWN(r->start - base, sgsz);
+ end = ALIGN(r->end - base, sgsz);
for (segno = start / sgsz; segno < end / sgsz; segno++) {
if (pe_bitmap)
set_bit(segno, pe_bitmap);
@@ -361,7 +362,7 @@ static struct pnv_ioda_pe *pnv_ioda_pick_m64_pe(struct pci_bus *bus, bool all)
return NULL;
/* Allocate bitmap */
- size = _ALIGN_UP(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
+ size = ALIGN(phb->ioda.total_pe_num / 8, sizeof(unsigned long));
pe_alloc = kzalloc(size, GFP_KERNEL);
if (!pe_alloc) {
pr_warn("%s: Out of memory !\n",
@@ -660,6 +661,16 @@ static int pnv_ioda_get_pe_state(struct pnv_phb *phb, int pe_no)
return state;
}
+struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn)
+{
+ int pe_number = phb->ioda.pe_rmap[bdfn];
+
+ if (pe_number == IODA_INVALID_PE)
+ return NULL;
+
+ return &phb->ioda.pe_array[pe_number];
+}
+
struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev)
{
struct pci_controller *hose = pci_bus_to_host(dev->bus);
@@ -1110,34 +1121,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_dev_PE(struct pci_dev *dev)
return pe;
}
-static void pnv_ioda_setup_same_PE(struct pci_bus *bus, struct pnv_ioda_pe *pe)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- struct pci_dn *pdn = pci_get_pdn(dev);
-
- if (pdn == NULL) {
- pr_warn("%s: No device node associated with device !\n",
- pci_name(dev));
- continue;
- }
-
- /*
- * In partial hotplug case, the PCI device might be still
- * associated with the PE and needn't attach it to the PE
- * again.
- */
- if (pdn->pe_number != IODA_INVALID_PE)
- continue;
-
- pe->device_count++;
- pdn->pe_number = pe->pe_number;
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_same_PE(dev->subordinate, pe);
- }
-}
-
/*
* There're 2 types of PCI bus sensitive PEs: One that is compromised of
* single PCI bus. Another one that contains the primary PCI bus and its
@@ -1156,15 +1139,13 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
* We should reuse it instead of allocating a new one.
*/
pe_num = phb->ioda.pe_rmap[bus->number << 8];
- if (pe_num != IODA_INVALID_PE) {
+ if (WARN_ON(pe_num != IODA_INVALID_PE)) {
pe = &phb->ioda.pe_array[pe_num];
- pnv_ioda_setup_same_PE(bus, pe);
return NULL;
}
/* PE number for root bus should have been reserved */
- if (pci_is_root_bus(bus) &&
- phb->ioda.root_pe_idx != IODA_INVALID_PE)
+ if (pci_is_root_bus(bus))
pe = &phb->ioda.pe_array[phb->ioda.root_pe_idx];
/* Check if PE is determined by M64 */
@@ -1202,9 +1183,6 @@ static struct pnv_ioda_pe *pnv_ioda_setup_bus_PE(struct pci_bus *bus, bool all)
return NULL;
}
- /* Associate it with all child devices */
- pnv_ioda_setup_same_PE(bus, pe);
-
/* Put PE to the list */
list_add_tail(&pe->list, &phb->ioda.pe_list);
@@ -1288,7 +1266,7 @@ static void pnv_ioda_setup_npu_PEs(struct pci_bus *bus)
pnv_ioda_setup_npu_PE(pdev);
}
-static void pnv_pci_ioda_setup_PEs(void)
+static void pnv_pci_ioda_setup_nvlink(void)
{
struct pci_controller *hose;
struct pnv_phb *phb;
@@ -1312,6 +1290,11 @@ static void pnv_pci_ioda_setup_PEs(void)
list_for_each_entry(pe, &phb->ioda.pe_list, list)
pnv_npu2_map_lpar(pe, MSR_DR | MSR_PR | MSR_HV);
}
+
+#ifdef CONFIG_IOMMU_API
+ /* setup iommu groups so we can do nvlink pass-thru */
+ pnv_pci_npu_setup_iommu_groups();
+#endif
}
#ifdef CONFIG_PCI_IOV
@@ -1550,11 +1533,6 @@ void pnv_pci_sriov_disable(struct pci_dev *pdev)
static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
struct pnv_ioda_pe *pe);
-#ifdef CONFIG_IOMMU_API
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group, struct pci_bus *bus);
-
-#endif
static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
{
struct pci_bus *bus;
@@ -1619,11 +1597,6 @@ static void pnv_ioda_setup_vf_PE(struct pci_dev *pdev, u16 num_vfs)
}
pnv_pci_ioda2_setup_dma_pe(phb, pe);
-#ifdef CONFIG_IOMMU_API
- iommu_register_group(&pe->table_group,
- pe->phb->hose->global_number, pe->pe_number);
- pnv_ioda_setup_bus_iommu_group(pe, &pe->table_group, NULL);
-#endif
}
}
@@ -1767,24 +1740,39 @@ static void pnv_pci_ioda_dma_dev_setup(struct pci_dev *pdev)
struct pci_dn *pdn = pci_get_pdn(pdev);
struct pnv_ioda_pe *pe;
- /*
- * The function can be called while the PE#
- * hasn't been assigned. Do nothing for the
- * case.
- */
- if (!pdn || pdn->pe_number == IODA_INVALID_PE)
- return;
+ /* Check if the BDFN for this device is associated with a PE yet */
+ pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
+ if (!pe) {
+ /* VF PEs should be pre-configured in pnv_pci_sriov_enable() */
+ if (WARN_ON(pdev->is_virtfn))
+ return;
+
+ pnv_pci_configure_bus(pdev->bus);
+ pe = pnv_pci_bdfn_to_pe(phb, pdev->devfn | (pdev->bus->number << 8));
+ pci_info(pdev, "Configured PE#%x\n", pe ? pe->pe_number : 0xfffff);
+
+
+ /*
+ * If we can't setup the IODA PE something has gone horribly
+ * wrong and we can't enable DMA for the device.
+ */
+ if (WARN_ON(!pe))
+ return;
+ } else {
+ pci_info(pdev, "Added to existing PE#%x\n", pe->pe_number);
+ }
+
+ if (pdn)
+ pdn->pe_number = pe->pe_number;
+ pe->device_count++;
- pe = &phb->ioda.pe_array[pdn->pe_number];
WARN_ON(get_dma_ops(&pdev->dev) != &dma_iommu_ops);
pdev->dev.archdata.dma_offset = pe->tce_bypass_base;
set_iommu_table_base(&pdev->dev, pe->table_group.tables[0]);
- /*
- * Note: iommu_add_device() will fail here as
- * for physical PE: the device is already added by now;
- * for virtual PE: sysfs entries are not ready yet and
- * tce_iommu_bus_notifier will add the device to a group later.
- */
+
+ /* PEs with a DMA weight of zero won't have a group */
+ if (pe->table_group.group)
+ iommu_add_device(&pe->table_group, &pdev->dev);
}
/*
@@ -2297,9 +2285,6 @@ found:
pe->table_group.tce32_size = tbl->it_size << tbl->it_page_shift;
iommu_init_table(tbl, phb->hose->node, 0, 0);
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
- pnv_ioda_setup_bus_dma(pe, pe->pbus);
-
return;
fail:
/* XXX Failure: Try to fallback to 64-bit only ? */
@@ -2537,7 +2522,7 @@ unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
direct_table_size = 1UL << table_shift;
for ( ; levels; --levels) {
- bytes += _ALIGN_UP(tce_table_size, direct_table_size);
+ bytes += ALIGN(tce_table_size, direct_table_size);
tce_table_size /= direct_table_size;
tce_table_size <<= 3;
@@ -2596,137 +2581,8 @@ static struct iommu_table_group_ops pnv_pci_ioda2_ops = {
.take_ownership = pnv_ioda2_take_ownership,
.release_ownership = pnv_ioda2_release_ownership,
};
-
-static void pnv_ioda_setup_bus_iommu_group_add_devices(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group,
- struct pci_bus *bus)
-{
- struct pci_dev *dev;
-
- list_for_each_entry(dev, &bus->devices, bus_list) {
- iommu_add_device(table_group, &dev->dev);
-
- if ((pe->flags & PNV_IODA_PE_BUS_ALL) && dev->subordinate)
- pnv_ioda_setup_bus_iommu_group_add_devices(pe,
- table_group, dev->subordinate);
- }
-}
-
-static void pnv_ioda_setup_bus_iommu_group(struct pnv_ioda_pe *pe,
- struct iommu_table_group *table_group, struct pci_bus *bus)
-{
-
- if (pe->flags & PNV_IODA_PE_DEV)
- iommu_add_device(table_group, &pe->pdev->dev);
-
- if ((pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL)) || bus)
- pnv_ioda_setup_bus_iommu_group_add_devices(pe, table_group,
- bus);
-}
-
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
-
-static void pnv_pci_ioda_setup_iommu_api(void)
-{
- struct pci_controller *hose;
- struct pnv_phb *phb;
- struct pnv_ioda_pe *pe;
-
- /*
- * There are 4 types of PEs:
- * - PNV_IODA_PE_BUS: a downstream port with an adapter,
- * created from pnv_pci_setup_bridge();
- * - PNV_IODA_PE_BUS_ALL: a PCI-PCIX bridge with devices behind it,
- * created from pnv_pci_setup_bridge();
- * - PNV_IODA_PE_VF: a SRIOV virtual function,
- * created from pnv_pcibios_sriov_enable();
- * - PNV_IODA_PE_DEV: an NPU or OCAPI device,
- * created from pnv_pci_ioda_fixup().
- *
- * Normally a PE is represented by an IOMMU group, however for
- * devices with side channels the groups need to be more strict.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- phb = hose->private_data;
-
- if (phb->type == PNV_PHB_NPU_NVLINK ||
- phb->type == PNV_PHB_NPU_OCAPI)
- continue;
-
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- struct iommu_table_group *table_group;
-
- table_group = pnv_try_setup_npu_table_group(pe);
- if (!table_group) {
- if (!pnv_pci_ioda_pe_dma_weight(pe))
- continue;
-
- table_group = &pe->table_group;
- iommu_register_group(&pe->table_group,
- pe->phb->hose->global_number,
- pe->pe_number);
- }
- pnv_ioda_setup_bus_iommu_group(pe, table_group,
- pe->pbus);
- }
- }
-
- /*
- * Now we have all PHBs discovered, time to add NPU devices to
- * the corresponding IOMMU groups.
- */
- list_for_each_entry(hose, &hose_list, list_node) {
- unsigned long pgsizes;
-
- phb = hose->private_data;
-
- if (phb->type != PNV_PHB_NPU_NVLINK)
- continue;
-
- pgsizes = pnv_ioda_parse_tce_sizes(phb);
- list_for_each_entry(pe, &phb->ioda.pe_list, list) {
- /*
- * IODA2 bridges get this set up from
- * pci_controller_ops::setup_bridge but NPU bridges
- * do not have this hook defined so we do it here.
- */
- pe->table_group.pgsizes = pgsizes;
- pnv_npu_compound_attach(pe);
- }
- }
-}
-#else /* !CONFIG_IOMMU_API */
-static void pnv_pci_ioda_setup_iommu_api(void) { };
#endif
-static unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb)
-{
- struct pci_controller *hose = phb->hose;
- struct device_node *dn = hose->dn;
- unsigned long mask = 0;
- int i, rc, count;
- u32 val;
-
- count = of_property_count_u32_elems(dn, "ibm,supported-tce-sizes");
- if (count <= 0) {
- mask = SZ_4K | SZ_64K;
- /* Add 16M for POWER8 by default */
- if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
- !cpu_has_feature(CPU_FTR_ARCH_300))
- mask |= SZ_16M | SZ_256M;
- return mask;
- }
-
- for (i = 0; i < count; i++) {
- rc = of_property_read_u32_index(dn, "ibm,supported-tce-sizes",
- i, &val);
- if (rc == 0)
- mask |= 1ULL << val;
- }
-
- return mask;
-}
-
static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
struct pnv_ioda_pe *pe)
{
@@ -2749,16 +2605,16 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb,
IOMMU_TABLE_GROUP_MAX_TABLES;
pe->table_group.max_levels = POWERNV_IOMMU_MAX_LEVELS;
pe->table_group.pgsizes = pnv_ioda_parse_tce_sizes(phb);
-#ifdef CONFIG_IOMMU_API
- pe->table_group.ops = &pnv_pci_ioda2_ops;
-#endif
rc = pnv_pci_ioda2_setup_default_config(pe);
if (rc)
return;
- if (pe->flags & (PNV_IODA_PE_BUS | PNV_IODA_PE_BUS_ALL))
- pnv_ioda_setup_bus_dma(pe, pe->pbus);
+#ifdef CONFIG_IOMMU_API
+ pe->table_group.ops = &pnv_pci_ioda2_ops;
+ iommu_register_group(&pe->table_group, phb->hose->global_number,
+ pe->pe_number);
+#endif
}
int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
@@ -3220,8 +3076,7 @@ static void pnv_pci_enable_bridges(void)
static void pnv_pci_ioda_fixup(void)
{
- pnv_pci_ioda_setup_PEs();
- pnv_pci_ioda_setup_iommu_api();
+ pnv_pci_ioda_setup_nvlink();
pnv_pci_ioda_create_dbgfs();
pnv_pci_enable_bridges();
@@ -3333,28 +3188,18 @@ static void pnv_pci_fixup_bridge_resources(struct pci_bus *bus,
}
}
-static void pnv_pci_setup_bridge(struct pci_bus *bus, unsigned long type)
+static void pnv_pci_configure_bus(struct pci_bus *bus)
{
struct pci_controller *hose = pci_bus_to_host(bus);
struct pnv_phb *phb = hose->private_data;
struct pci_dev *bridge = bus->self;
struct pnv_ioda_pe *pe;
- bool all = (pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
-
- /* Extend bridge's windows if necessary */
- pnv_pci_fixup_bridge_resources(bus, type);
+ bool all = (bridge && pci_pcie_type(bridge) == PCI_EXP_TYPE_PCI_BRIDGE);
- /* The PE for root bus should be realized before any one else */
- if (!phb->ioda.root_pe_populated) {
- pe = pnv_ioda_setup_bus_PE(phb->hose->bus, false);
- if (pe) {
- phb->ioda.root_pe_idx = pe->pe_number;
- phb->ioda.root_pe_populated = true;
- }
- }
+ dev_info(&bus->dev, "Configuring PE for bus\n");
/* Don't assign PE to PCI bus, which doesn't have subordinate devices */
- if (list_empty(&bus->devices))
+ if (WARN_ON(list_empty(&bus->devices)))
return;
/* Reserve PEs according to used M64 resources */
@@ -3599,6 +3444,8 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
struct pnv_phb *phb = pe->phb;
struct pnv_ioda_pe *slave, *tmp;
+ pe_info(pe, "Releasing PE\n");
+
mutex_lock(&phb->ioda.pe_list_mutex);
list_del(&pe->list);
mutex_unlock(&phb->ioda.pe_list_mutex);
@@ -3633,11 +3480,10 @@ static void pnv_ioda_release_pe(struct pnv_ioda_pe *pe)
* that it can be populated again in PCI hot add path. The PE
* shouldn't be destroyed as it's the global reserved resource.
*/
- if (phb->ioda.root_pe_populated &&
- phb->ioda.root_pe_idx == pe->pe_number)
- phb->ioda.root_pe_populated = false;
- else
- pnv_ioda_free_pe(pe);
+ if (phb->ioda.root_pe_idx == pe->pe_number)
+ return;
+
+ pnv_ioda_free_pe(pe);
}
static void pnv_pci_release_device(struct pci_dev *pdev)
@@ -3715,7 +3561,7 @@ static const struct pci_controller_ops pnv_pci_ioda_controller_ops = {
.enable_device_hook = pnv_pci_enable_device_hook,
.release_device = pnv_pci_release_device,
.window_alignment = pnv_pci_window_alignment,
- .setup_bridge = pnv_pci_setup_bridge,
+ .setup_bridge = pnv_pci_fixup_bridge_resources,
.reset_secondary_bus = pnv_pci_reset_secondary_bus,
.shutdown = pnv_pci_ioda_shutdown,
};
@@ -3745,6 +3591,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
struct pnv_phb *phb;
unsigned long size, m64map_off, m32map_off, pemap_off;
unsigned long iomap_off = 0, dma32map_off = 0;
+ struct pnv_ioda_pe *root_pe;
struct resource r;
const __be64 *prop64;
const __be32 *prop32;
@@ -3863,7 +3710,7 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
PNV_IODA1_DMA32_SEGSIZE;
/* Allocate aux data & arrays. We don't have IO ports on PHB3 */
- size = _ALIGN_UP(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
+ size = ALIGN(max_t(unsigned, phb->ioda.total_pe_num, 8) / 8,
sizeof(unsigned long));
m64map_off = size;
size += phb->ioda.total_pe_num * sizeof(phb->ioda.m64_segmap[0]);
@@ -3912,7 +3759,9 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np,
phb->ioda.root_pe_idx = phb->ioda.reserved_pe_idx - 1;
pnv_ioda_reserve_pe(phb, phb->ioda.root_pe_idx);
} else {
- phb->ioda.root_pe_idx = IODA_INVALID_PE;
+ /* otherwise just allocate one */
+ root_pe = pnv_ioda_alloc_pe(phb);
+ phb->ioda.root_pe_idx = root_pe->pe_number;
}
INIT_LIST_HEAD(&phb->ioda.pe_list);
diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c
index 5bf818246339..091fe1cf386b 100644
--- a/arch/powerpc/platforms/powernv/pci.c
+++ b/arch/powerpc/platforms/powernv/pci.c
@@ -955,28 +955,8 @@ static int pnv_tce_iommu_bus_notifier(struct notifier_block *nb,
unsigned long action, void *data)
{
struct device *dev = data;
- struct pci_dev *pdev;
- struct pci_dn *pdn;
- struct pnv_ioda_pe *pe;
- struct pci_controller *hose;
- struct pnv_phb *phb;
switch (action) {
- case BUS_NOTIFY_ADD_DEVICE:
- pdev = to_pci_dev(dev);
- pdn = pci_get_pdn(pdev);
- hose = pci_bus_to_host(pdev->bus);
- phb = hose->private_data;
-
- WARN_ON_ONCE(!phb);
- if (!pdn || pdn->pe_number == IODA_INVALID_PE || !phb)
- return 0;
-
- pe = &phb->ioda.pe_array[pdn->pe_number];
- if (!pe->table_group.group)
- return 0;
- iommu_add_device(&pe->table_group, dev);
- return 0;
case BUS_NOTIFY_DEL_DEVICE:
iommu_del_device(dev);
return 0;
diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h
index d3bbdeab3a32..51c254f2f3cb 100644
--- a/arch/powerpc/platforms/powernv/pci.h
+++ b/arch/powerpc/platforms/powernv/pci.h
@@ -33,6 +33,24 @@ enum pnv_phb_model {
#define PNV_IODA_PE_SLAVE (1 << 4) /* Slave PE in compound case */
#define PNV_IODA_PE_VF (1 << 5) /* PE for one VF */
+/*
+ * A brief note on PNV_IODA_PE_BUS_ALL
+ *
+ * This is needed because of the behaviour of PCIe-to-PCI bridges. The PHB uses
+ * the Requester ID field of the PCIe request header to determine the device
+ * (and PE) that initiated a DMA. In legacy PCI individual memory read/write
+ * requests aren't tagged with the RID. To work around this the PCIe-to-PCI
+ * bridge will use (secondary_bus_no << 8) | 0x00 as the RID on the PCIe side.
+ *
+ * PCIe-to-X bridges have a similar issue even though PCI-X requests also have
+ * a RID in the transaction header. The PCIe-to-X bridge is permitted to "take
+ * ownership" of a transaction by a PCI-X device when forwarding it to the PCIe
+ * side of the bridge.
+ *
+ * To work around these problems we use the BUS_ALL flag since every subordinate
+ * bus of the bridge should go into the same PE.
+ */
+
/* Indicates operations are frozen for a PE: MMIO in PESTA & DMA in PESTB. */
#define PNV_IODA_STOPPED_STATE 0x8000000000000000
@@ -118,7 +136,6 @@ struct pnv_phb {
unsigned int total_pe_num;
unsigned int reserved_pe_idx;
unsigned int root_pe_idx;
- bool root_pe_populated;
/* 32-bit MMIO window */
unsigned int m32_size;
@@ -190,6 +207,7 @@ extern int pnv_eeh_phb_reset(struct pci_controller *hose, int option);
extern int pnv_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type);
extern void pnv_teardown_msi_irqs(struct pci_dev *pdev);
+extern struct pnv_ioda_pe *pnv_pci_bdfn_to_pe(struct pnv_phb *phb, u16 bdfn);
extern struct pnv_ioda_pe *pnv_ioda_get_pe(struct pci_dev *dev);
extern void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq);
extern unsigned long pnv_pci_ioda2_get_table_size(__u32 page_shift,
@@ -209,11 +227,7 @@ extern void pe_level_printk(const struct pnv_ioda_pe *pe, const char *level,
/* Nvlink functions */
extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass);
extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm);
-extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe);
-extern struct iommu_table_group *pnv_try_setup_npu_table_group(
- struct pnv_ioda_pe *pe);
-extern struct iommu_table_group *pnv_npu_compound_attach(
- struct pnv_ioda_pe *pe);
+extern void pnv_pci_npu_setup_iommu_groups(void);
/* pci-ioda-tce.c */
#define POWERNV_IOMMU_DEFAULT_LEVELS 2
@@ -244,4 +258,6 @@ extern void pnv_pci_setup_iommu_table(struct iommu_table *tbl,
void *tce_mem, u64 tce_size,
u64 dma_offset, unsigned int page_shift);
+extern unsigned long pnv_ioda_parse_tce_sizes(struct pnv_phb *phb);
+
#endif /* __POWERNV_PCI_H */
diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c
index 13e251699346..b2ba3e95bda7 100644
--- a/arch/powerpc/platforms/powernv/smp.c
+++ b/arch/powerpc/platforms/powernv/smp.c
@@ -167,7 +167,6 @@ static void pnv_smp_cpu_kill_self(void)
/* Standard hot unplug procedure */
idle_task_exit();
- current->active_mm = NULL; /* for sanity */
cpu = smp_processor_id();
DBG("CPU%d offline\n", cpu);
generic_set_cpu_dead(cpu);
diff --git a/arch/powerpc/platforms/powernv/vas-api.c b/arch/powerpc/platforms/powernv/vas-api.c
new file mode 100644
index 000000000000..98ed5d8c5441
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-api.c
@@ -0,0 +1,278 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * VAS user space API for its accelerators (Only NX-GZIP is supported now)
+ * Copyright (C) 2019 Haren Myneni, IBM Corp
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/cdev.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <asm/vas.h>
+#include <uapi/asm/vas-api.h>
+#include "vas.h"
+
+/*
+ * The driver creates the device node that can be used as follows:
+ * For NX-GZIP
+ *
+ * fd = open("/dev/crypto/nx-gzip", O_RDWR);
+ * rc = ioctl(fd, VAS_TX_WIN_OPEN, &attr);
+ * paste_addr = mmap(NULL, PAGE_SIZE, prot, MAP_SHARED, fd, 0ULL).
+ * vas_copy(&crb, 0, 1);
+ * vas_paste(paste_addr, 0, 1);
+ * close(fd) or exit process to close window.
+ *
+ * where "vas_copy" and "vas_paste" are defined in copy-paste.h.
+ * copy/paste returns to the user space directly. So refer NX hardware
+ * documententation for exact copy/paste usage and completion / error
+ * conditions.
+ */
+
+/*
+ * Wrapper object for the nx-gzip device - there is just one instance of
+ * this node for the whole system.
+ */
+static struct coproc_dev {
+ struct cdev cdev;
+ struct device *device;
+ char *name;
+ dev_t devt;
+ struct class *class;
+ enum vas_cop_type cop_type;
+} coproc_device;
+
+struct coproc_instance {
+ struct coproc_dev *coproc;
+ struct vas_window *txwin;
+};
+
+static char *coproc_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "crypto/%s", dev_name(dev));
+}
+
+static int coproc_open(struct inode *inode, struct file *fp)
+{
+ struct coproc_instance *cp_inst;
+
+ cp_inst = kzalloc(sizeof(*cp_inst), GFP_KERNEL);
+ if (!cp_inst)
+ return -ENOMEM;
+
+ cp_inst->coproc = container_of(inode->i_cdev, struct coproc_dev,
+ cdev);
+ fp->private_data = cp_inst;
+
+ return 0;
+}
+
+static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg)
+{
+ void __user *uptr = (void __user *)arg;
+ struct vas_tx_win_attr txattr = {};
+ struct vas_tx_win_open_attr uattr;
+ struct coproc_instance *cp_inst;
+ struct vas_window *txwin;
+ int rc, vasid;
+
+ cp_inst = fp->private_data;
+
+ /*
+ * One window for file descriptor
+ */
+ if (cp_inst->txwin)
+ return -EEXIST;
+
+ rc = copy_from_user(&uattr, uptr, sizeof(uattr));
+ if (rc) {
+ pr_err("%s(): copy_from_user() returns %d\n", __func__, rc);
+ return -EFAULT;
+ }
+
+ if (uattr.version != 1) {
+ pr_err("Invalid version\n");
+ return -EINVAL;
+ }
+
+ vasid = uattr.vas_id;
+
+ vas_init_tx_win_attr(&txattr, cp_inst->coproc->cop_type);
+
+ txattr.lpid = mfspr(SPRN_LPID);
+ txattr.pidr = mfspr(SPRN_PID);
+ txattr.user_win = true;
+ txattr.rsvd_txbuf_count = false;
+ txattr.pswid = false;
+
+ pr_devel("Pid %d: Opening txwin, PIDR %ld\n", txattr.pidr,
+ mfspr(SPRN_PID));
+
+ txwin = vas_tx_win_open(vasid, cp_inst->coproc->cop_type, &txattr);
+ if (IS_ERR(txwin)) {
+ pr_err("%s() vas_tx_win_open() failed, %ld\n", __func__,
+ PTR_ERR(txwin));
+ return PTR_ERR(txwin);
+ }
+
+ cp_inst->txwin = txwin;
+
+ return 0;
+}
+
+static int coproc_release(struct inode *inode, struct file *fp)
+{
+ struct coproc_instance *cp_inst = fp->private_data;
+
+ if (cp_inst->txwin) {
+ vas_win_close(cp_inst->txwin);
+ cp_inst->txwin = NULL;
+ }
+
+ kfree(cp_inst);
+ fp->private_data = NULL;
+
+ /*
+ * We don't know here if user has other receive windows
+ * open, so we can't really call clear_thread_tidr().
+ * So, once the process calls set_thread_tidr(), the
+ * TIDR value sticks around until process exits, resulting
+ * in an extra copy in restore_sprs().
+ */
+
+ return 0;
+}
+
+static int coproc_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct coproc_instance *cp_inst = fp->private_data;
+ struct vas_window *txwin;
+ unsigned long pfn;
+ u64 paste_addr;
+ pgprot_t prot;
+ int rc;
+
+ txwin = cp_inst->txwin;
+
+ if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+ pr_debug("%s(): size 0x%zx, PAGE_SIZE 0x%zx\n", __func__,
+ (vma->vm_end - vma->vm_start), PAGE_SIZE);
+ return -EINVAL;
+ }
+
+ /* Ensure instance has an open send window */
+ if (!txwin) {
+ pr_err("%s(): No send window open?\n", __func__);
+ return -EINVAL;
+ }
+
+ vas_win_paste_addr(txwin, &paste_addr, NULL);
+ pfn = paste_addr >> PAGE_SHIFT;
+
+ /* flags, page_prot from cxl_mmap(), except we want cachable */
+ vma->vm_flags |= VM_IO | VM_PFNMAP;
+ vma->vm_page_prot = pgprot_cached(vma->vm_page_prot);
+
+ prot = __pgprot(pgprot_val(vma->vm_page_prot) | _PAGE_DIRTY);
+
+ rc = remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff,
+ vma->vm_end - vma->vm_start, prot);
+
+ pr_devel("%s(): paste addr %llx at %lx, rc %d\n", __func__,
+ paste_addr, vma->vm_start, rc);
+
+ return rc;
+}
+
+static long coproc_ioctl(struct file *fp, unsigned int cmd, unsigned long arg)
+{
+ switch (cmd) {
+ case VAS_TX_WIN_OPEN:
+ return coproc_ioc_tx_win_open(fp, arg);
+ default:
+ return -EINVAL;
+ }
+}
+
+static struct file_operations coproc_fops = {
+ .open = coproc_open,
+ .release = coproc_release,
+ .mmap = coproc_mmap,
+ .unlocked_ioctl = coproc_ioctl,
+};
+
+/*
+ * Supporting only nx-gzip coprocessor type now, but this API code
+ * extended to other coprocessor types later.
+ */
+int vas_register_coproc_api(struct module *mod, enum vas_cop_type cop_type,
+ const char *name)
+{
+ int rc = -EINVAL;
+ dev_t devno;
+
+ rc = alloc_chrdev_region(&coproc_device.devt, 1, 1, name);
+ if (rc) {
+ pr_err("Unable to allocate coproc major number: %i\n", rc);
+ return rc;
+ }
+
+ pr_devel("%s device allocated, dev [%i,%i]\n", name,
+ MAJOR(coproc_device.devt), MINOR(coproc_device.devt));
+
+ coproc_device.class = class_create(mod, name);
+ if (IS_ERR(coproc_device.class)) {
+ rc = PTR_ERR(coproc_device.class);
+ pr_err("Unable to create %s class %d\n", name, rc);
+ goto err_class;
+ }
+ coproc_device.class->devnode = coproc_devnode;
+ coproc_device.cop_type = cop_type;
+
+ coproc_fops.owner = mod;
+ cdev_init(&coproc_device.cdev, &coproc_fops);
+
+ devno = MKDEV(MAJOR(coproc_device.devt), 0);
+ rc = cdev_add(&coproc_device.cdev, devno, 1);
+ if (rc) {
+ pr_err("cdev_add() failed %d\n", rc);
+ goto err_cdev;
+ }
+
+ coproc_device.device = device_create(coproc_device.class, NULL,
+ devno, NULL, name, MINOR(devno));
+ if (IS_ERR(coproc_device.device)) {
+ rc = PTR_ERR(coproc_device.device);
+ pr_err("Unable to create coproc-%d %d\n", MINOR(devno), rc);
+ goto err;
+ }
+
+ pr_devel("%s: Added dev [%d,%d]\n", __func__, MAJOR(devno),
+ MINOR(devno));
+
+ return 0;
+
+err:
+ cdev_del(&coproc_device.cdev);
+err_cdev:
+ class_destroy(coproc_device.class);
+err_class:
+ unregister_chrdev_region(coproc_device.devt, 1);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(vas_register_coproc_api);
+
+void vas_unregister_coproc_api(void)
+{
+ dev_t devno;
+
+ cdev_del(&coproc_device.cdev);
+ devno = MKDEV(MAJOR(coproc_device.devt), 0);
+ device_destroy(coproc_device.class, devno);
+
+ class_destroy(coproc_device.class);
+ unregister_chrdev_region(coproc_device.devt, 1);
+}
+EXPORT_SYMBOL_GPL(vas_unregister_coproc_api);
diff --git a/arch/powerpc/platforms/powernv/vas-debug.c b/arch/powerpc/platforms/powernv/vas-debug.c
index 44035a3d6414..41fa90d2f4ab 100644
--- a/arch/powerpc/platforms/powernv/vas-debug.c
+++ b/arch/powerpc/platforms/powernv/vas-debug.c
@@ -38,7 +38,7 @@ static int info_show(struct seq_file *s, void *private)
seq_printf(s, "Type: %s, %s\n", cop_to_str(window->cop),
window->tx_win ? "Send" : "Receive");
- seq_printf(s, "Pid : %d\n", window->pid);
+ seq_printf(s, "Pid : %d\n", vas_window_pid(window));
unlock:
mutex_unlock(&vas_mutex);
diff --git a/arch/powerpc/platforms/powernv/vas-fault.c b/arch/powerpc/platforms/powernv/vas-fault.c
new file mode 100644
index 000000000000..25db70be4c9c
--- /dev/null
+++ b/arch/powerpc/platforms/powernv/vas-fault.c
@@ -0,0 +1,382 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * VAS Fault handling.
+ * Copyright 2019, IBM Corporation
+ */
+
+#define pr_fmt(fmt) "vas: " fmt
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/sched/signal.h>
+#include <linux/mmu_context.h>
+#include <asm/icswx.h>
+
+#include "vas.h"
+
+/*
+ * The maximum FIFO size for fault window can be 8MB
+ * (VAS_RX_FIFO_SIZE_MAX). Using 4MB FIFO since each VAS
+ * instance will be having fault window.
+ * 8MB FIFO can be used if expects more faults for each VAS
+ * instance.
+ */
+#define VAS_FAULT_WIN_FIFO_SIZE (4 << 20)
+
+static void dump_crb(struct coprocessor_request_block *crb)
+{
+ struct data_descriptor_entry *dde;
+ struct nx_fault_stamp *nx;
+
+ dde = &crb->source;
+ pr_devel("SrcDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n",
+ be64_to_cpu(dde->address), be32_to_cpu(dde->length),
+ dde->count, dde->index, dde->flags);
+
+ dde = &crb->target;
+ pr_devel("TgtDDE: addr 0x%llx, len %d, count %d, idx %d, flags %d\n",
+ be64_to_cpu(dde->address), be32_to_cpu(dde->length),
+ dde->count, dde->index, dde->flags);
+
+ nx = &crb->stamp.nx;
+ pr_devel("NX Stamp: PSWID 0x%x, FSA 0x%llx, flags 0x%x, FS 0x%x\n",
+ be32_to_cpu(nx->pswid),
+ be64_to_cpu(crb->stamp.nx.fault_storage_addr),
+ nx->flags, nx->fault_status);
+}
+
+/*
+ * Update the CSB to indicate a translation error.
+ *
+ * User space will be polling on CSB after the request is issued.
+ * If NX can handle the request without any issues, it updates CSB.
+ * Whereas if NX encounters page fault, the kernel will handle the
+ * fault and update CSB with translation error.
+ *
+ * If we are unable to update the CSB means copy_to_user failed due to
+ * invalid csb_addr, send a signal to the process.
+ */
+static void update_csb(struct vas_window *window,
+ struct coprocessor_request_block *crb)
+{
+ struct coprocessor_status_block csb;
+ struct kernel_siginfo info;
+ struct task_struct *tsk;
+ void __user *csb_addr;
+ struct pid *pid;
+ int rc;
+
+ /*
+ * NX user space windows can not be opened for task->mm=NULL
+ * and faults will not be generated for kernel requests.
+ */
+ if (WARN_ON_ONCE(!window->mm || !window->user_win))
+ return;
+
+ csb_addr = (void __user *)be64_to_cpu(crb->csb_addr);
+
+ memset(&csb, 0, sizeof(csb));
+ csb.cc = CSB_CC_TRANSLATION;
+ csb.ce = CSB_CE_TERMINATION;
+ csb.cs = 0;
+ csb.count = 0;
+
+ /*
+ * NX operates and returns in BE format as defined CRB struct.
+ * So saves fault_storage_addr in BE as NX pastes in FIFO and
+ * expects user space to convert to CPU format.
+ */
+ csb.address = crb->stamp.nx.fault_storage_addr;
+ csb.flags = 0;
+
+ pid = window->pid;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ /*
+ * Process closes send window after all pending NX requests are
+ * completed. In multi-thread applications, a child thread can
+ * open a window and can exit without closing it. May be some
+ * requests are pending or this window can be used by other
+ * threads later. We should handle faults if NX encounters
+ * pages faults on these requests. Update CSB with translation
+ * error and fault address. If csb_addr passed by user space is
+ * invalid, send SEGV signal to pid saved in window. If the
+ * child thread is not running, send the signal to tgid.
+ * Parent thread (tgid) will close this window upon its exit.
+ *
+ * pid and mm references are taken when window is opened by
+ * process (pid). So tgid is used only when child thread opens
+ * a window and exits without closing it.
+ */
+ if (!tsk) {
+ pid = window->tgid;
+ tsk = get_pid_task(pid, PIDTYPE_PID);
+ /*
+ * Parent thread (tgid) will be closing window when it
+ * exits. So should not get here.
+ */
+ if (WARN_ON_ONCE(!tsk))
+ return;
+ }
+
+ /* Return if the task is exiting. */
+ if (tsk->flags & PF_EXITING) {
+ put_task_struct(tsk);
+ return;
+ }
+
+ use_mm(window->mm);
+ rc = copy_to_user(csb_addr, &csb, sizeof(csb));
+ /*
+ * User space polls on csb.flags (first byte). So add barrier
+ * then copy first byte with csb flags update.
+ */
+ if (!rc) {
+ csb.flags = CSB_V;
+ /* Make sure update to csb.flags is visible now */
+ smp_mb();
+ rc = copy_to_user(csb_addr, &csb, sizeof(u8));
+ }
+ unuse_mm(window->mm);
+ put_task_struct(tsk);
+
+ /* Success */
+ if (!rc)
+ return;
+
+ pr_debug("Invalid CSB address 0x%p signalling pid(%d)\n",
+ csb_addr, pid_vnr(pid));
+
+ clear_siginfo(&info);
+ info.si_signo = SIGSEGV;
+ info.si_errno = EFAULT;
+ info.si_code = SEGV_MAPERR;
+ info.si_addr = csb_addr;
+
+ /*
+ * process will be polling on csb.flags after request is sent to
+ * NX. So generally CSB update should not fail except when an
+ * application passes invalid csb_addr. So an error message will
+ * be displayed and leave it to user space whether to ignore or
+ * handle this signal.
+ */
+ rcu_read_lock();
+ rc = kill_pid_info(SIGSEGV, &info, pid);
+ rcu_read_unlock();
+
+ pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__,
+ pid_vnr(pid), rc);
+}
+
+static void dump_fifo(struct vas_instance *vinst, void *entry)
+{
+ unsigned long *end = vinst->fault_fifo + vinst->fault_fifo_size;
+ unsigned long *fifo = entry;
+ int i;
+
+ pr_err("Fault fifo size %d, Max crbs %d\n", vinst->fault_fifo_size,
+ vinst->fault_fifo_size / CRB_SIZE);
+
+ /* Dump 10 CRB entries or until end of FIFO */
+ pr_err("Fault FIFO Dump:\n");
+ for (i = 0; i < 10*(CRB_SIZE/8) && fifo < end; i += 4, fifo += 4) {
+ pr_err("[%.3d, %p]: 0x%.16lx 0x%.16lx 0x%.16lx 0x%.16lx\n",
+ i, fifo, *fifo, *(fifo+1), *(fifo+2), *(fifo+3));
+ }
+}
+
+/*
+ * Process valid CRBs in fault FIFO.
+ * NX process user space requests, return credit and update the status
+ * in CRB. If it encounters transalation error when accessing CRB or
+ * request buffers, raises interrupt on the CPU to handle the fault.
+ * It takes credit on fault window, updates nx_fault_stamp in CRB with
+ * the following information and pastes CRB in fault FIFO.
+ *
+ * pswid - window ID of the window on which the request is sent.
+ * fault_storage_addr - fault address
+ *
+ * It can raise a single interrupt for multiple faults. Expects OS to
+ * process all valid faults and return credit for each fault on user
+ * space and fault windows. This fault FIFO control will be done with
+ * credit mechanism. NX can continuously paste CRBs until credits are not
+ * available on fault window. Otherwise, returns with RMA_reject.
+ *
+ * Total credits available on fault window: FIFO_SIZE(4MB)/CRBS_SIZE(128)
+ *
+ */
+irqreturn_t vas_fault_thread_fn(int irq, void *data)
+{
+ struct vas_instance *vinst = data;
+ struct coprocessor_request_block *crb, *entry;
+ struct coprocessor_request_block buf;
+ struct vas_window *window;
+ unsigned long flags;
+ void *fifo;
+
+ crb = &buf;
+
+ /*
+ * VAS can interrupt with multiple page faults. So process all
+ * valid CRBs within fault FIFO until reaches invalid CRB.
+ * We use CCW[0] and pswid to validate validate CRBs:
+ *
+ * CCW[0] Reserved bit. When NX pastes CRB, CCW[0]=0
+ * OS sets this bit to 1 after reading CRB.
+ * pswid NX assigns window ID. Set pswid to -1 after
+ * reading CRB from fault FIFO.
+ *
+ * We exit this function if no valid CRBs are available to process.
+ * So acquire fault_lock and reset fifo_in_progress to 0 before
+ * exit.
+ * In case kernel receives another interrupt with different page
+ * fault, interrupt handler returns with IRQ_HANDLED if
+ * fifo_in_progress is set. Means these new faults will be
+ * handled by the current thread. Otherwise set fifo_in_progress
+ * and return IRQ_WAKE_THREAD to wake up thread.
+ */
+ while (true) {
+ spin_lock_irqsave(&vinst->fault_lock, flags);
+ /*
+ * Advance the fault fifo pointer to next CRB.
+ * Use CRB_SIZE rather than sizeof(*crb) since the latter is
+ * aligned to CRB_ALIGN (256) but the CRB written to by VAS is
+ * only CRB_SIZE in len.
+ */
+ fifo = vinst->fault_fifo + (vinst->fault_crbs * CRB_SIZE);
+ entry = fifo;
+
+ if ((entry->stamp.nx.pswid == cpu_to_be32(FIFO_INVALID_ENTRY))
+ || (entry->ccw & cpu_to_be32(CCW0_INVALID))) {
+ vinst->fifo_in_progress = 0;
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+ return IRQ_HANDLED;
+ }
+
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+ vinst->fault_crbs++;
+ if (vinst->fault_crbs == (vinst->fault_fifo_size / CRB_SIZE))
+ vinst->fault_crbs = 0;
+
+ memcpy(crb, fifo, CRB_SIZE);
+ entry->stamp.nx.pswid = cpu_to_be32(FIFO_INVALID_ENTRY);
+ entry->ccw |= cpu_to_be32(CCW0_INVALID);
+ /*
+ * Return credit for the fault window.
+ */
+ vas_return_credit(vinst->fault_win, false);
+
+ pr_devel("VAS[%d] fault_fifo %p, fifo %p, fault_crbs %d\n",
+ vinst->vas_id, vinst->fault_fifo, fifo,
+ vinst->fault_crbs);
+
+ dump_crb(crb);
+ window = vas_pswid_to_window(vinst,
+ be32_to_cpu(crb->stamp.nx.pswid));
+
+ if (IS_ERR(window)) {
+ /*
+ * We got an interrupt about a specific send
+ * window but we can't find that window and we can't
+ * even clean it up (return credit on user space
+ * window).
+ * But we should not get here.
+ * TODO: Disable IRQ.
+ */
+ dump_fifo(vinst, (void *)entry);
+ pr_err("VAS[%d] fault_fifo %p, fifo %p, pswid 0x%x, fault_crbs %d bad CRB?\n",
+ vinst->vas_id, vinst->fault_fifo, fifo,
+ be32_to_cpu(crb->stamp.nx.pswid),
+ vinst->fault_crbs);
+
+ WARN_ON_ONCE(1);
+ } else {
+ update_csb(window, crb);
+ /*
+ * Return credit for send window after processing
+ * fault CRB.
+ */
+ vas_return_credit(window, true);
+ }
+ }
+}
+
+irqreturn_t vas_fault_handler(int irq, void *dev_id)
+{
+ struct vas_instance *vinst = dev_id;
+ irqreturn_t ret = IRQ_WAKE_THREAD;
+ unsigned long flags;
+
+ /*
+ * NX can generate an interrupt for multiple faults. So the
+ * fault handler thread process all CRBs until finds invalid
+ * entry. In case if NX sees continuous faults, it is possible
+ * that the thread function entered with the first interrupt
+ * can execute and process all valid CRBs.
+ * So wake up thread only if the fault thread is not in progress.
+ */
+ spin_lock_irqsave(&vinst->fault_lock, flags);
+
+ if (vinst->fifo_in_progress)
+ ret = IRQ_HANDLED;
+ else
+ vinst->fifo_in_progress = 1;
+
+ spin_unlock_irqrestore(&vinst->fault_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Fault window is opened per VAS instance. NX pastes fault CRB in fault
+ * FIFO upon page faults.
+ */
+int vas_setup_fault_window(struct vas_instance *vinst)
+{
+ struct vas_rx_win_attr attr;
+
+ vinst->fault_fifo_size = VAS_FAULT_WIN_FIFO_SIZE;
+ vinst->fault_fifo = kzalloc(vinst->fault_fifo_size, GFP_KERNEL);
+ if (!vinst->fault_fifo) {
+ pr_err("Unable to alloc %d bytes for fault_fifo\n",
+ vinst->fault_fifo_size);
+ return -ENOMEM;
+ }
+
+ /*
+ * Invalidate all CRB entries. NX pastes valid entry for each fault.
+ */
+ memset(vinst->fault_fifo, FIFO_INVALID_ENTRY, vinst->fault_fifo_size);
+ vas_init_rx_win_attr(&attr, VAS_COP_TYPE_FAULT);
+
+ attr.rx_fifo_size = vinst->fault_fifo_size;
+ attr.rx_fifo = vinst->fault_fifo;
+
+ /*
+ * Max creds is based on number of CRBs can fit in the FIFO.
+ * (fault_fifo_size/CRB_SIZE). If 8MB FIFO is used, max creds
+ * will be 0xffff since the receive creds field is 16bits wide.
+ */
+ attr.wcreds_max = vinst->fault_fifo_size / CRB_SIZE;
+ attr.lnotify_lpid = 0;
+ attr.lnotify_pid = mfspr(SPRN_PID);
+ attr.lnotify_tid = mfspr(SPRN_PID);
+
+ vinst->fault_win = vas_rx_win_open(vinst->vas_id, VAS_COP_TYPE_FAULT,
+ &attr);
+
+ if (IS_ERR(vinst->fault_win)) {
+ pr_err("VAS: Error %ld opening FaultWin\n",
+ PTR_ERR(vinst->fault_win));
+ kfree(vinst->fault_fifo);
+ return PTR_ERR(vinst->fault_win);
+ }
+
+ pr_devel("VAS: Created FaultWin %d, LPID/PID/TID [%d/%d/%d]\n",
+ vinst->fault_win->winid, attr.lnotify_lpid,
+ attr.lnotify_pid, attr.lnotify_tid);
+
+ return 0;
+}
diff --git a/arch/powerpc/platforms/powernv/vas-window.c b/arch/powerpc/platforms/powernv/vas-window.c
index 0c0d27d17976..6434f9cb5aed 100644
--- a/arch/powerpc/platforms/powernv/vas-window.c
+++ b/arch/powerpc/platforms/powernv/vas-window.c
@@ -12,6 +12,8 @@
#include <linux/log2.h>
#include <linux/rcupdate.h>
#include <linux/cred.h>
+#include <linux/sched/mm.h>
+#include <linux/mmu_context.h>
#include <asm/switch_to.h>
#include <asm/ppc-opcode.h>
#include "vas.h"
@@ -24,7 +26,7 @@
* Compute the paste address region for the window @window using the
* ->paste_base_addr and ->paste_win_id_shift we got from device tree.
*/
-static void compute_paste_address(struct vas_window *window, u64 *addr, int *len)
+void vas_win_paste_addr(struct vas_window *window, u64 *addr, int *len)
{
int winid;
u64 base, shift;
@@ -78,7 +80,7 @@ static void *map_paste_region(struct vas_window *txwin)
goto free_name;
txwin->paste_addr_name = name;
- compute_paste_address(txwin, &start, &len);
+ vas_win_paste_addr(txwin, &start, &len);
if (!request_mem_region(start, len, name)) {
pr_devel("%s(): request_mem_region(0x%llx, %d) failed\n",
@@ -136,7 +138,7 @@ static void unmap_paste_region(struct vas_window *window)
u64 busaddr_start;
if (window->paste_kaddr) {
- compute_paste_address(window, &busaddr_start, &len);
+ vas_win_paste_addr(window, &busaddr_start, &len);
unmap_region(window->paste_kaddr, busaddr_start, len);
window->paste_kaddr = NULL;
kfree(window->paste_addr_name);
@@ -373,7 +375,7 @@ int init_winctx_regs(struct vas_window *window, struct vas_winctx *winctx)
init_xlate_regs(window, winctx->user_win);
val = 0ULL;
- val = SET_FIELD(VAS_FAULT_TX_WIN, val, 0);
+ val = SET_FIELD(VAS_FAULT_TX_WIN, val, winctx->fault_win_id);
write_hvwc_reg(window, VREG(FAULT_TX_WIN), val);
/* In PowerNV, interrupts go to HV. */
@@ -748,6 +750,8 @@ static void init_winctx_for_rxwin(struct vas_window *rxwin,
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+ if (rxwin->vinst->virq)
+ winctx->irq_port = rxwin->vinst->irq_port;
}
static bool rx_win_args_valid(enum vas_cop_type cop,
@@ -768,7 +772,7 @@ static bool rx_win_args_valid(enum vas_cop_type cop,
if (attr->rx_fifo_size > VAS_RX_FIFO_SIZE_MAX)
return false;
- if (attr->wcreds_max > VAS_RX_WCREDS_MAX)
+ if (!attr->wcreds_max)
return false;
if (attr->nx_win) {
@@ -813,7 +817,8 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
{
memset(rxattr, 0, sizeof(*rxattr));
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+ cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
rxattr->pin_win = true;
rxattr->nx_win = true;
rxattr->fault_win = false;
@@ -827,9 +832,9 @@ void vas_init_rx_win_attr(struct vas_rx_win_attr *rxattr, enum vas_cop_type cop)
rxattr->fault_win = true;
rxattr->notify_disable = true;
rxattr->rx_wcred_mode = true;
- rxattr->tx_wcred_mode = true;
rxattr->rx_win_ord_mode = true;
- rxattr->tx_win_ord_mode = true;
+ rxattr->rej_no_credit = true;
+ rxattr->tc_mode = VAS_THRESH_DISABLED;
} else if (cop == VAS_COP_TYPE_FTW) {
rxattr->user_win = true;
rxattr->intr_disable = true;
@@ -873,9 +878,7 @@ struct vas_window *vas_rx_win_open(int vasid, enum vas_cop_type cop,
rxwin->nx_win = rxattr->nx_win;
rxwin->user_win = rxattr->user_win;
rxwin->cop = cop;
- rxwin->wcreds_max = rxattr->wcreds_max ?: VAS_WCREDS_DEFAULT;
- if (rxattr->user_win)
- rxwin->pid = task_pid_vnr(current);
+ rxwin->wcreds_max = rxattr->wcreds_max;
init_winctx_for_rxwin(rxwin, rxattr, &winctx);
init_winctx_regs(rxwin, &winctx);
@@ -890,7 +893,8 @@ void vas_init_tx_win_attr(struct vas_tx_win_attr *txattr, enum vas_cop_type cop)
{
memset(txattr, 0, sizeof(*txattr));
- if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI) {
+ if (cop == VAS_COP_TYPE_842 || cop == VAS_COP_TYPE_842_HIPRI ||
+ cop == VAS_COP_TYPE_GZIP || cop == VAS_COP_TYPE_GZIP_HIPRI) {
txattr->rej_no_credit = false;
txattr->rx_wcred_mode = true;
txattr->tx_wcred_mode = true;
@@ -944,13 +948,22 @@ static void init_winctx_for_txwin(struct vas_window *txwin,
winctx->lpid = txattr->lpid;
winctx->pidr = txattr->pidr;
winctx->rx_win_id = txwin->rxwin->winid;
+ /*
+ * IRQ and fault window setup is successful. Set fault window
+ * for the send window so that ready to handle faults.
+ */
+ if (txwin->vinst->virq)
+ winctx->fault_win_id = txwin->vinst->fault_win->winid;
winctx->dma_type = VAS_DMA_TYPE_INJECT;
winctx->tc_mode = txattr->tc_mode;
winctx->min_scope = VAS_SCOPE_LOCAL;
winctx->max_scope = VAS_SCOPE_VECTORED_GROUP;
+ if (txwin->vinst->virq)
+ winctx->irq_port = txwin->vinst->irq_port;
- winctx->pswid = 0;
+ winctx->pswid = txattr->pswid ? txattr->pswid :
+ encode_pswid(txwin->vinst->vas_id, txwin->winid);
}
static bool tx_win_args_valid(enum vas_cop_type cop,
@@ -965,9 +978,14 @@ static bool tx_win_args_valid(enum vas_cop_type cop,
if (attr->wcreds_max > VAS_TX_WCREDS_MAX)
return false;
- if (attr->user_win &&
- (cop != VAS_COP_TYPE_FTW || attr->rsvd_txbuf_count))
- return false;
+ if (attr->user_win) {
+ if (attr->rsvd_txbuf_count)
+ return false;
+
+ if (cop != VAS_COP_TYPE_FTW && cop != VAS_COP_TYPE_GZIP &&
+ cop != VAS_COP_TYPE_GZIP_HIPRI)
+ return false;
+ }
return true;
}
@@ -1016,7 +1034,6 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
txwin->tx_win = 1;
txwin->rxwin = rxwin;
txwin->nx_win = txwin->rxwin->nx_win;
- txwin->pid = attr->pid;
txwin->user_win = attr->user_win;
txwin->wcreds_max = attr->wcreds_max ?: VAS_WCREDS_DEFAULT;
@@ -1040,12 +1057,59 @@ struct vas_window *vas_tx_win_open(int vasid, enum vas_cop_type cop,
}
} else {
/*
- * A user mapping must ensure that context switch issues
- * CP_ABORT for this thread.
+ * Interrupt hanlder or fault window setup failed. Means
+ * NX can not generate fault for page fault. So not
+ * opening for user space tx window.
*/
- rc = set_thread_uses_vas();
- if (rc)
+ if (!vinst->virq) {
+ rc = -ENODEV;
goto free_window;
+ }
+
+ /*
+ * Window opened by a child thread may not be closed when
+ * it exits. So take reference to its pid and release it
+ * when the window is free by parent thread.
+ * Acquire a reference to the task's pid to make sure
+ * pid will not be re-used - needed only for multithread
+ * applications.
+ */
+ txwin->pid = get_task_pid(current, PIDTYPE_PID);
+ /*
+ * Acquire a reference to the task's mm.
+ */
+ txwin->mm = get_task_mm(current);
+
+ if (!txwin->mm) {
+ put_pid(txwin->pid);
+ pr_err("VAS: pid(%d): mm_struct is not found\n",
+ current->pid);
+ rc = -EPERM;
+ goto free_window;
+ }
+
+ mmgrab(txwin->mm);
+ mmput(txwin->mm);
+ mm_context_add_vas_window(txwin->mm);
+ /*
+ * Process closes window during exit. In the case of
+ * multithread application, the child thread can open
+ * window and can exit without closing it. Expects parent
+ * thread to use and close the window. So do not need
+ * to take pid reference for parent thread.
+ */
+ txwin->tgid = find_get_pid(task_tgid_vnr(current));
+ /*
+ * Even a process that has no foreign real address mapping can
+ * use an unpaired COPY instruction (to no real effect). Issue
+ * CP_ABORT to clear any pending COPY and prevent a covert
+ * channel.
+ *
+ * __switch_to() will issue CP_ABORT on future context switches
+ * if process / thread has any open VAS window (Use
+ * current->mm->context.vas_windows).
+ */
+ asm volatile(PPC_CP_ABORT);
}
set_vinst_win(vinst, txwin);
@@ -1128,6 +1192,7 @@ static void poll_window_credits(struct vas_window *window)
{
u64 val;
int creds, mode;
+ int count = 0;
val = read_hvwc_reg(window, VREG(WINCTL));
if (window->tx_win)
@@ -1146,10 +1211,27 @@ retry:
creds = GET_FIELD(VAS_LRX_WCRED, val);
}
+ /*
+ * Takes around few milliseconds to complete all pending requests
+ * and return credits.
+ * TODO: Scan fault FIFO and invalidate CRBs points to this window
+ * and issue CRB Kill to stop all pending requests. Need only
+ * if there is a bug in NX or fault handling in kernel.
+ */
if (creds < window->wcreds_max) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
schedule_timeout(msecs_to_jiffies(10));
+ count++;
+ /*
+ * Process can not close send window until all credits are
+ * returned.
+ */
+ if (!(count % 1000))
+ pr_warn_ratelimited("VAS: pid %d stuck. Waiting for credits returned for Window(%d). creds %d, Retries %d\n",
+ vas_window_pid(window), window->winid,
+ creds, count);
+
goto retry;
}
}
@@ -1163,6 +1245,7 @@ static void poll_window_busy_state(struct vas_window *window)
{
int busy;
u64 val;
+ int count = 0;
retry:
val = read_hvwc_reg(window, VREG(WIN_STATUS));
@@ -1170,7 +1253,16 @@ retry:
if (busy) {
val = 0;
set_current_state(TASK_UNINTERRUPTIBLE);
- schedule_timeout(msecs_to_jiffies(5));
+ schedule_timeout(msecs_to_jiffies(10));
+ count++;
+ /*
+ * Takes around few milliseconds to process all pending
+ * requests.
+ */
+ if (!(count % 1000))
+ pr_warn_ratelimited("VAS: pid %d stuck. Window (ID=%d) is in busy state. Retries %d\n",
+ vas_window_pid(window), window->winid, count);
+
goto retry;
}
}
@@ -1235,22 +1327,118 @@ int vas_win_close(struct vas_window *window)
unmap_paste_region(window);
- clear_vinst_win(window);
-
poll_window_busy_state(window);
unpin_close_window(window);
poll_window_credits(window);
+ clear_vinst_win(window);
+
poll_window_castout(window);
/* if send window, drop reference to matching receive window */
- if (window->tx_win)
+ if (window->tx_win) {
+ if (window->user_win) {
+ /* Drop references to pid and mm */
+ put_pid(window->pid);
+ if (window->mm) {
+ mm_context_remove_vas_window(window->mm);
+ mmdrop(window->mm);
+ }
+ }
put_rx_win(window->rxwin);
+ }
vas_window_free(window);
return 0;
}
EXPORT_SYMBOL_GPL(vas_win_close);
+
+/*
+ * Return credit for the given window.
+ * Send windows and fault window uses credit mechanism as follows:
+ *
+ * Send windows:
+ * - The default number of credits available for each send window is
+ * 1024. It means 1024 requests can be issued asynchronously at the
+ * same time. If the credit is not available, that request will be
+ * returned with RMA_Busy.
+ * - One credit is taken when NX request is issued.
+ * - This credit is returned after NX processed that request.
+ * - If NX encounters translation error, kernel will return the
+ * credit on the specific send window after processing the fault CRB.
+ *
+ * Fault window:
+ * - The total number credits available is FIFO_SIZE/CRB_SIZE.
+ * Means 4MB/128 in the current implementation. If credit is not
+ * available, RMA_Reject is returned.
+ * - A credit is taken when NX pastes CRB in fault FIFO.
+ * - The kernel with return credit on fault window after reading entry
+ * from fault FIFO.
+ */
+void vas_return_credit(struct vas_window *window, bool tx)
+{
+ uint64_t val;
+
+ val = 0ULL;
+ if (tx) { /* send window */
+ val = SET_FIELD(VAS_TX_WCRED, val, 1);
+ write_hvwc_reg(window, VREG(TX_WCRED_ADDER), val);
+ } else {
+ val = SET_FIELD(VAS_LRX_WCRED, val, 1);
+ write_hvwc_reg(window, VREG(LRX_WCRED_ADDER), val);
+ }
+}
+
+struct vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+ uint32_t pswid)
+{
+ struct vas_window *window;
+ int winid;
+
+ if (!pswid) {
+ pr_devel("%s: called for pswid 0!\n", __func__);
+ return ERR_PTR(-ESRCH);
+ }
+
+ decode_pswid(pswid, NULL, &winid);
+
+ if (winid >= VAS_WINDOWS_PER_CHIP)
+ return ERR_PTR(-ESRCH);
+
+ /*
+ * If application closes the window before the hardware
+ * returns the fault CRB, we should wait in vas_win_close()
+ * for the pending requests. so the window must be active
+ * and the process alive.
+ *
+ * If its a kernel process, we should not get any faults and
+ * should not get here.
+ */
+ window = vinst->windows[winid];
+
+ if (!window) {
+ pr_err("PSWID decode: Could not find window for winid %d pswid %d vinst 0x%p\n",
+ winid, pswid, vinst);
+ return NULL;
+ }
+
+ /*
+ * Do some sanity checks on the decoded window. Window should be
+ * NX GZIP user send window. FTW windows should not incur faults
+ * since their CRBs are ignored (not queued on FIFO or processed
+ * by NX).
+ */
+ if (!window->tx_win || !window->user_win || !window->nx_win ||
+ window->cop == VAS_COP_TYPE_FAULT ||
+ window->cop == VAS_COP_TYPE_FTW) {
+ pr_err("PSWID decode: id %d, tx %d, user %d, nx %d, cop %d\n",
+ winid, window->tx_win, window->user_win,
+ window->nx_win, window->cop);
+ WARN_ON(1);
+ }
+
+ return window;
+}
diff --git a/arch/powerpc/platforms/powernv/vas.c b/arch/powerpc/platforms/powernv/vas.c
index ed9cc6df329a..598e4cd563fb 100644
--- a/arch/powerpc/platforms/powernv/vas.c
+++ b/arch/powerpc/platforms/powernv/vas.c
@@ -14,7 +14,10 @@
#include <linux/of_platform.h>
#include <linux/of_address.h>
#include <linux/of.h>
+#include <linux/irqdomain.h>
+#include <linux/interrupt.h>
#include <asm/prom.h>
+#include <asm/xive.h>
#include "vas.h"
@@ -23,12 +26,37 @@ static LIST_HEAD(vas_instances);
static DEFINE_PER_CPU(int, cpu_vas_id);
+static int vas_irq_fault_window_setup(struct vas_instance *vinst)
+{
+ char devname[64];
+ int rc = 0;
+
+ snprintf(devname, sizeof(devname), "vas-%d", vinst->vas_id);
+ rc = request_threaded_irq(vinst->virq, vas_fault_handler,
+ vas_fault_thread_fn, 0, devname, vinst);
+
+ if (rc) {
+ pr_err("VAS[%d]: Request IRQ(%d) failed with %d\n",
+ vinst->vas_id, vinst->virq, rc);
+ goto out;
+ }
+
+ rc = vas_setup_fault_window(vinst);
+ if (rc)
+ free_irq(vinst->virq, vinst);
+
+out:
+ return rc;
+}
+
static int init_vas_instance(struct platform_device *pdev)
{
- int rc, cpu, vasid;
- struct resource *res;
- struct vas_instance *vinst;
struct device_node *dn = pdev->dev.of_node;
+ struct vas_instance *vinst;
+ struct xive_irq_data *xd;
+ uint32_t chipid, hwirq;
+ struct resource *res;
+ int rc, cpu, vasid;
rc = of_property_read_u32(dn, "ibm,vas-id", &vasid);
if (rc) {
@@ -36,6 +64,12 @@ static int init_vas_instance(struct platform_device *pdev)
return -ENODEV;
}
+ rc = of_property_read_u32(dn, "ibm,chip-id", &chipid);
+ if (rc) {
+ pr_err("No ibm,chip-id property for %s?\n", pdev->name);
+ return -ENODEV;
+ }
+
if (pdev->num_resources != 4) {
pr_err("Unexpected DT configuration for [%s, %d]\n",
pdev->name, vasid);
@@ -69,9 +103,32 @@ static int init_vas_instance(struct platform_device *pdev)
vinst->paste_win_id_shift = 63 - res->end;
- pr_devel("Initialized instance [%s, %d], paste_base 0x%llx, "
- "paste_win_id_shift 0x%llx\n", pdev->name, vasid,
- vinst->paste_base_addr, vinst->paste_win_id_shift);
+ hwirq = xive_native_alloc_irq_on_chip(chipid);
+ if (!hwirq) {
+ pr_err("Inst%d: Unable to allocate global irq for chip %d\n",
+ vinst->vas_id, chipid);
+ return -ENOENT;
+ }
+
+ vinst->virq = irq_create_mapping(NULL, hwirq);
+ if (!vinst->virq) {
+ pr_err("Inst%d: Unable to map global irq %d\n",
+ vinst->vas_id, hwirq);
+ return -EINVAL;
+ }
+
+ xd = irq_get_handler_data(vinst->virq);
+ if (!xd) {
+ pr_err("Inst%d: Invalid virq %d\n",
+ vinst->vas_id, vinst->virq);
+ return -EINVAL;
+ }
+
+ vinst->irq_port = xd->trig_page;
+ pr_devel("Initialized instance [%s, %d] paste_base 0x%llx paste_win_id_shift 0x%llx IRQ %d Port 0x%llx\n",
+ pdev->name, vasid, vinst->paste_base_addr,
+ vinst->paste_win_id_shift, vinst->virq,
+ vinst->irq_port);
for_each_possible_cpu(cpu) {
if (cpu_to_chip_id(cpu) == of_get_ibm_chip_id(dn))
@@ -82,6 +139,22 @@ static int init_vas_instance(struct platform_device *pdev)
list_add(&vinst->node, &vas_instances);
mutex_unlock(&vas_mutex);
+ spin_lock_init(&vinst->fault_lock);
+ /*
+ * IRQ and fault handling setup is needed only for user space
+ * send windows.
+ */
+ if (vinst->virq) {
+ rc = vas_irq_fault_window_setup(vinst);
+ /*
+ * Fault window is used only for user space send windows.
+ * So if vinst->virq is NULL, tx_win_open returns -ENODEV
+ * for user space.
+ */
+ if (rc)
+ vinst->virq = 0;
+ }
+
vas_instance_init_dbgdir(vinst);
dev_set_drvdata(&pdev->dev, vinst);
diff --git a/arch/powerpc/platforms/powernv/vas.h b/arch/powerpc/platforms/powernv/vas.h
index 5574aec9ee88..70f793e8f6cc 100644
--- a/arch/powerpc/platforms/powernv/vas.h
+++ b/arch/powerpc/platforms/powernv/vas.h
@@ -101,11 +101,9 @@
/*
* Initial per-process credits.
* Max send window credits: 4K-1 (12-bits in VAS_TX_WCRED)
- * Max receive window credits: 64K-1 (16 bits in VAS_LRX_WCRED)
*
* TODO: Needs tuning for per-process credits
*/
-#define VAS_RX_WCREDS_MAX ((64 << 10) - 1)
#define VAS_TX_WCREDS_MAX ((4 << 10) - 1)
#define VAS_WCREDS_DEFAULT (1 << 10)
@@ -296,6 +294,22 @@ enum vas_notify_after_count {
};
/*
+ * NX can generate an interrupt for multiple faults and expects kernel
+ * to process all of them. So read all valid CRB entries until find the
+ * invalid one. So use pswid which is pasted by NX and ccw[0] (reserved
+ * bit in BE) to check valid CRB. CCW[0] will not be touched by user
+ * space. Application gets CRB formt error if it updates this bit.
+ *
+ * Invalidate FIFO during allocation and process all entries from last
+ * successful read until finds invalid pswid and ccw[0] values.
+ * After reading each CRB entry from fault FIFO, the kernel invalidate
+ * it by updating pswid with FIFO_INVALID_ENTRY and CCW[0] with
+ * CCW0_INVALID.
+ */
+#define FIFO_INVALID_ENTRY 0xffffffff
+#define CCW0_INVALID 1
+
+/*
* One per instance of VAS. Each instance will have a separate set of
* receive windows, one per coprocessor type.
*
@@ -313,6 +327,15 @@ struct vas_instance {
u64 paste_base_addr;
u64 paste_win_id_shift;
+ u64 irq_port;
+ int virq;
+ int fault_crbs;
+ int fault_fifo_size;
+ int fifo_in_progress; /* To wake up thread or return IRQ_HANDLED */
+ spinlock_t fault_lock; /* Protects fifo_in_progress update */
+ void *fault_fifo;
+ struct vas_window *fault_win; /* Fault window */
+
struct mutex mutex;
struct vas_window *rxwin[VAS_COP_TYPE_MAX];
struct vas_window *windows[VAS_WINDOWS_PER_CHIP];
@@ -333,7 +356,9 @@ struct vas_window {
bool user_win; /* True if user space window */
void *hvwc_map; /* HV window context */
void *uwc_map; /* OS/User window context */
- pid_t pid; /* Linux process id of owner */
+ struct pid *pid; /* Linux process id of owner */
+ struct pid *tgid; /* Thread group ID of owner */
+ struct mm_struct *mm; /* Linux process mm_struct */
int wcreds_max; /* Window credits */
char *dbgname;
@@ -406,6 +431,19 @@ extern void vas_init_dbgdir(void);
extern void vas_instance_init_dbgdir(struct vas_instance *vinst);
extern void vas_window_init_dbgdir(struct vas_window *win);
extern void vas_window_free_dbgdir(struct vas_window *win);
+extern int vas_setup_fault_window(struct vas_instance *vinst);
+extern irqreturn_t vas_fault_thread_fn(int irq, void *data);
+extern irqreturn_t vas_fault_handler(int irq, void *dev_id);
+extern void vas_return_credit(struct vas_window *window, bool tx);
+extern struct vas_window *vas_pswid_to_window(struct vas_instance *vinst,
+ uint32_t pswid);
+extern void vas_win_paste_addr(struct vas_window *window, u64 *addr,
+ int *len);
+
+static inline int vas_window_pid(struct vas_window *window)
+{
+ return pid_vnr(window->pid);
+}
static inline void vas_log_write(struct vas_window *win, char *name,
void *regptr, u64 val)
@@ -444,6 +482,21 @@ static inline u64 read_hvwc_reg(struct vas_window *win,
return in_be64(win->hvwc_map+reg);
}
+/*
+ * Encode/decode the Partition Send Window ID (PSWID) for a window in
+ * a way that we can uniquely identify any window in the system. i.e.
+ * we should be able to locate the 'struct vas_window' given the PSWID.
+ *
+ * Bits Usage
+ * 0:7 VAS id (8 bits)
+ * 8:15 Unused, 0 (3 bits)
+ * 16:31 Window id (16 bits)
+ */
+static inline u32 encode_pswid(int vasid, int winid)
+{
+ return ((u32)winid | (vasid << (31 - 7)));
+}
+
static inline void decode_pswid(u32 pswid, int *vasid, int *winid)
{
if (vasid)