From a5127a2dbe95dd72b6a21c98dee0857511f30357 Mon Sep 17 00:00:00 2001
From: Thierry Reding
Date: Fri, 31 Jan 2020 17:59:08 +0100
Subject: drm/tegra: sor: Suspend on clock registration failure

Make sure the SOR module is suspended after we fail to register the
SOR pad output clock.

Signed-off-by: Thierry Reding
---
 drivers/gpu/drm/tegra/sor.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'drivers/gpu/drm')

diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index f884185c5e9f..30c96b15d7a3 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -3921,15 +3921,16 @@ static int tegra_sor_probe(struct platform_device *pdev)
         if (!sor->clk_pad) {
                 char *name;
 
-                err = host1x_client_resume(&sor->client);
-                if (err < 0) {
-                        dev_err(sor->dev, "failed to resume: %d\n", err);
+                name = devm_kasprintf(sor->dev, GFP_KERNEL, "sor%u_pad_clkout",
+                                      sor->index);
+                if (!name) {
+                        err = -ENOMEM;
                         goto remove;
                 }
 
-                name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "sor%u_pad_clkout", sor->index);
-                if (!name) {
-                        err = -ENOMEM;
+                err = host1x_client_resume(&sor->client);
+                if (err < 0) {
+                        dev_err(sor->dev, "failed to resume: %d\n", err);
                         goto remove;
                 }
 
--
cgit v1.2.3-59-g8ed1b

From ad2139cb80a7a9afbfe428d0448d351a84e22ee6 Mon Sep 17 00:00:00 2001
From: Thierry Reding
Date: Fri, 31 Jan 2020 17:59:09 +0100
Subject: drm/tegra: sor: Disable runtime PM on probe failure

If the driver fails to probe, make sure to disable runtime PM again.
While at it, make the cleanup code in ->remove() symmetric.

Signed-off-by: Thierry Reding
---
 drivers/gpu/drm/tegra/sor.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'drivers/gpu/drm')

diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index 30c96b15d7a3..e79dd65c1b54 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -3925,13 +3925,13 @@ static int tegra_sor_probe(struct platform_device *pdev)
                                       sor->index);
                 if (!name) {
                         err = -ENOMEM;
-                        goto remove;
+                        goto rpm_disable;
                 }
 
                 err = host1x_client_resume(&sor->client);
                 if (err < 0) {
                         dev_err(sor->dev, "failed to resume: %d\n", err);
-                        goto remove;
+                        goto rpm_disable;
                 }
 
                 sor->clk_pad = tegra_clk_sor_pad_register(sor, name);
@@ -3942,7 +3942,7 @@ static int tegra_sor_probe(struct platform_device *pdev)
                 err = PTR_ERR(sor->clk_pad);
                 dev_err(&pdev->dev, "failed to register SOR pad clock: %d\n",
                         err);
-                goto remove;
+                goto rpm_disable;
         }
 
         INIT_LIST_HEAD(&sor->client.list);
@@ -3953,11 +3953,13 @@ static int tegra_sor_probe(struct platform_device *pdev)
         if (err < 0) {
                 dev_err(&pdev->dev, "failed to register host1x client: %d\n",
                         err);
-                goto remove;
+                goto rpm_disable;
         }
 
         return 0;
 
+rpm_disable:
+        pm_runtime_disable(&pdev->dev);
 remove:
         if (sor->ops && sor->ops->remove)
                 sor->ops->remove(sor);
@@ -3971,8 +3973,6 @@ static int tegra_sor_remove(struct platform_device *pdev)
         struct tegra_sor *sor = platform_get_drvdata(pdev);
         int err;
 
-        pm_runtime_disable(&pdev->dev);
-
         err = host1x_client_unregister(&sor->client);
         if (err < 0) {
                 dev_err(&pdev->dev, "failed to unregister host1x client: %d\n",
@@ -3980,6 +3980,8 @@ static int tegra_sor_remove(struct platform_device *pdev)
                 return err;
         }
 
+        pm_runtime_disable(&pdev->dev);
+
         if (sor->ops && sor->ops->remove) {
                 err = sor->ops->remove(sor);
                 if (err < 0)
--
cgit v1.2.3-59-g8ed1b

From c472a0b0a1fd1688157b4ad6efc1c3fb8e571a53 Mon Sep 17 00:00:00 2001
From: Thierry Reding
Date: Fri, 31 Jan 2020 17:59:10 +0100
Subject: drm/tegra: sor: Initialize runtime PM before use

Commit fd67e9c6ed5a ("drm/tegra: Do not implement runtime PM") replaced
the generic runtime PM usage by a host1x bus-specific implementation in
order to work around some assumptions baked into runtime PM that are in
conflict with the requirements in the Tegra DRM driver.

Unfortunately the new runtime PM callbacks are not set up yet at the
time when the SOR driver first needs to resume the device to register
the SOR pad clock, and accesses to registers will cause the system to
hang.

Note that this only happens on Tegra124 and Tegra210 because those are
the only SoCs where the SOR pad clock is registered from the SOR
driver. Later generations use a SOR pad clock provided by the BPMP.

Fix this by moving the registration of the SOR pad clock to after the
host1x client has been registered. That's somewhat suboptimal because
this could potentially, though it's very unlikely, cause the Tegra DRM
to be probed if the SOR happens to be the last subdevice to register,
only to be immediately removed again if the SOR pad output clock fails
to register. That's just a minor annoyance, though, and doesn't justify
implementing a workaround.

Fixes: fd67e9c6ed5a ("drm/tegra: Do not implement runtime PM")
Signed-off-by: Thierry Reding
---
 drivers/gpu/drm/tegra/sor.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'drivers/gpu/drm')

diff --git a/drivers/gpu/drm/tegra/sor.c b/drivers/gpu/drm/tegra/sor.c
index e79dd65c1b54..a9222841862e 100644
--- a/drivers/gpu/drm/tegra/sor.c
+++ b/drivers/gpu/drm/tegra/sor.c
@@ -3914,6 +3914,17 @@ static int tegra_sor_probe(struct platform_device *pdev)
         platform_set_drvdata(pdev, sor);
         pm_runtime_enable(&pdev->dev);
 
+        INIT_LIST_HEAD(&sor->client.list);
+        sor->client.ops = &sor_client_ops;
+        sor->client.dev = &pdev->dev;
+
+        err = host1x_client_register(&sor->client);
+        if (err < 0) {
+                dev_err(&pdev->dev, "failed to register host1x client: %d\n",
+                        err);
+                goto rpm_disable;
+        }
+
         /*
          * On Tegra210 and earlier, provide our own implementation for the
          * pad output clock.
@@ -3925,13 +3936,13 @@ static int tegra_sor_probe(struct platform_device *pdev)
                                       sor->index);
                 if (!name) {
                         err = -ENOMEM;
-                        goto rpm_disable;
+                        goto unregister;
                 }
 
                 err = host1x_client_resume(&sor->client);
                 if (err < 0) {
                         dev_err(sor->dev, "failed to resume: %d\n", err);
-                        goto rpm_disable;
+                        goto unregister;
                 }
 
                 sor->clk_pad = tegra_clk_sor_pad_register(sor, name);
@@ -3940,24 +3951,15 @@ static int tegra_sor_probe(struct platform_device *pdev)
 
         if (IS_ERR(sor->clk_pad)) {
                 err = PTR_ERR(sor->clk_pad);
-                dev_err(&pdev->dev, "failed to register SOR pad clock: %d\n",
+                dev_err(sor->dev, "failed to register SOR pad clock: %d\n",
                         err);
-                goto rpm_disable;
-        }
-
-        INIT_LIST_HEAD(&sor->client.list);
-        sor->client.ops = &sor_client_ops;
-        sor->client.dev = &pdev->dev;
-
-        err = host1x_client_register(&sor->client);
-        if (err < 0) {
-                dev_err(&pdev->dev, "failed to register host1x client: %d\n",
-                        err);
-                goto rpm_disable;
+                goto unregister;
         }
 
         return 0;
 
+unregister:
+        host1x_client_unregister(&sor->client);
 rpm_disable:
         pm_runtime_disable(&pdev->dev);
 remove:
--
cgit v1.2.3-59-g8ed1b

From 2d9384ff91770a71bd1ff24c25952ef1187a0e9c Mon Sep 17 00:00:00 2001
From: Thierry Reding
Date: Tue, 4 Feb 2020 14:59:24 +0100
Subject: drm/tegra: Relax IOMMU usage criteria on old Tegra

Older Tegra devices only allow addressing 32 bits of memory, so whether
or not the host1x is attached to an IOMMU doesn't matter.
host1x IOMMU attachment is only needed on devices that can address
memory beyond the 32-bit boundary and where the host1x doesn't support
the wide GATHER opcode that allows it to access buffers at higher
addresses.

Cc: # v5.5
Signed-off-by: Thierry Reding
Tested-by: Dmitry Osipenko
Reviewed-by: Dmitry Osipenko
---
 drivers/gpu/drm/tegra/drm.c | 49 ++++++++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 32 insertions(+), 17 deletions(-)

(limited to 'drivers/gpu/drm')

diff --git a/drivers/gpu/drm/tegra/drm.c b/drivers/gpu/drm/tegra/drm.c
index aa9e49f04988..bd268028fb3d 100644
--- a/drivers/gpu/drm/tegra/drm.c
+++ b/drivers/gpu/drm/tegra/drm.c
@@ -1037,23 +1037,9 @@ void tegra_drm_free(struct tegra_drm *tegra, size_t size, void *virt,
                 free_pages((unsigned long)virt, get_order(size));
 }
 
-static int host1x_drm_probe(struct host1x_device *dev)
+static bool host1x_drm_wants_iommu(struct host1x_device *dev)
 {
-        struct drm_driver *driver = &tegra_drm_driver;
         struct iommu_domain *domain;
-        struct tegra_drm *tegra;
-        struct drm_device *drm;
-        int err;
-
-        drm = drm_dev_alloc(driver, &dev->dev);
-        if (IS_ERR(drm))
-                return PTR_ERR(drm);
-
-        tegra = kzalloc(sizeof(*tegra), GFP_KERNEL);
-        if (!tegra) {
-                err = -ENOMEM;
-                goto put;
-        }
 
         /*
          * If the Tegra DRM clients are backed by an IOMMU, push buffers are
@@ -1082,9 +1068,38 @@ static int host1x_drm_probe(struct host1x_device *dev)
          * up the device tree appropriately. This is considered an problem
          * of integration, so care must be taken for the DT to be consistent.
          */
-        domain = iommu_get_domain_for_dev(drm->dev->parent);
+        domain = iommu_get_domain_for_dev(dev->dev.parent);
+
+        /*
+         * Tegra20 and Tegra30 don't support addressing memory beyond the
+         * 32-bit boundary, so the regular GATHER opcodes will always be
+         * sufficient and whether or not the host1x is attached to an IOMMU
+         * doesn't matter.
+         */
+        if (!domain && dma_get_mask(dev->dev.parent) <= DMA_BIT_MASK(32))
+                return true;
+
+        return domain != NULL;
+}
+
+static int host1x_drm_probe(struct host1x_device *dev)
+{
+        struct drm_driver *driver = &tegra_drm_driver;
+        struct tegra_drm *tegra;
+        struct drm_device *drm;
+        int err;
+
+        drm = drm_dev_alloc(driver, &dev->dev);
+        if (IS_ERR(drm))
+                return PTR_ERR(drm);
+
+        tegra = kzalloc(sizeof(*tegra), GFP_KERNEL);
+        if (!tegra) {
+                err = -ENOMEM;
+                goto put;
+        }
 
-        if (domain && iommu_present(&platform_bus_type)) {
+        if (host1x_drm_wants_iommu(dev) && iommu_present(&platform_bus_type)) {
                 tegra->domain = iommu_domain_alloc(&platform_bus_type);
                 if (!tegra->domain) {
                         err = -ENOMEM;
--
cgit v1.2.3-59-g8ed1b

From 273da5a046965ccf0ec79eb63f2d5173467e20fa Mon Sep 17 00:00:00 2001
From: Thierry Reding
Date: Tue, 4 Feb 2020 14:59:25 +0100
Subject: drm/tegra: Reuse IOVA mapping where possible

This partially reverts the DMA API support that was recently merged
because it was causing performance regressions on older Tegra devices.
Unfortunately, the cache maintenance performed by dma_map_sg() and
dma_unmap_sg() causes performance to drop by a factor of 10.

The right solution for this would be to cache mappings for buffers per
consumer device, but that's a bit involved. Instead, we simply revert
to the old behaviour of sharing IOVA mappings when we know that devices
can do so (i.e. they share the same IOMMU domain).

Cc: # v5.5
Reported-by: Dmitry Osipenko
Signed-off-by: Thierry Reding
Tested-by: Dmitry Osipenko
Reviewed-by: Dmitry Osipenko
---
 drivers/gpu/drm/tegra/gem.c   | 10 +++++++++-
 drivers/gpu/drm/tegra/plane.c | 44 +++++++++++++++++++++++++++++-------------------
 drivers/gpu/host1x/job.c      | 32 +++++++++++++++++++++++++++++++++++++++---
 3 files changed, 63 insertions(+), 23 deletions(-)

(limited to 'drivers/gpu/drm')

diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c
index bc15b430156d..c46b4d4190ac 100644
--- a/drivers/gpu/drm/tegra/gem.c
+++ b/drivers/gpu/drm/tegra/gem.c
@@ -60,8 +60,16 @@ static struct sg_table *tegra_bo_pin(struct device *dev, struct host1x_bo *bo,
         /*
          * If we've manually mapped the buffer object through the IOMMU, make
          * sure to return the IOVA address of our mapping.
+         *
+         * Similarly, for buffers that have been allocated by the DMA API the
+         * physical address can be used for devices that are not attached to
+         * an IOMMU. For these devices, callers must pass a valid pointer via
+         * the @phys argument.
+         *
+         * Imported buffers were also already mapped at import time, so the
+         * existing mapping can be reused.
          */
-        if (phys && obj->mm) {
+        if (phys) {
                 *phys = obj->iova;
                 return NULL;
         }
diff --git a/drivers/gpu/drm/tegra/plane.c b/drivers/gpu/drm/tegra/plane.c
index cadcdd9ea427..9ccfb56e9b01 100644
--- a/drivers/gpu/drm/tegra/plane.c
+++ b/drivers/gpu/drm/tegra/plane.c
@@ -3,6 +3,8 @@
  * Copyright (C) 2017 NVIDIA CORPORATION. All rights reserved.
  */
 
+#include
+
 #include
 #include
 #include
@@ -107,21 +109,27 @@ const struct drm_plane_funcs tegra_plane_funcs = {
 
 static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state)
 {
+        struct iommu_domain *domain = iommu_get_domain_for_dev(dc->dev);
         unsigned int i;
         int err;
 
         for (i = 0; i < state->base.fb->format->num_planes; i++) {
                 struct tegra_bo *bo = tegra_fb_get_plane(state->base.fb, i);
+                dma_addr_t phys_addr, *phys;
+                struct sg_table *sgt;
 
-                if (!dc->client.group) {
-                        struct sg_table *sgt;
+                if (!domain || dc->client.group)
+                        phys = &phys_addr;
+                else
+                        phys = NULL;
 
-                        sgt = host1x_bo_pin(dc->dev, &bo->base, NULL);
-                        if (IS_ERR(sgt)) {
-                                err = PTR_ERR(sgt);
-                                goto unpin;
-                        }
+                sgt = host1x_bo_pin(dc->dev, &bo->base, phys);
+                if (IS_ERR(sgt)) {
+                        err = PTR_ERR(sgt);
+                        goto unpin;
+                }
 
+                if (sgt) {
                         err = dma_map_sg(dc->dev, sgt->sgl, sgt->nents,
                                          DMA_TO_DEVICE);
                         if (err == 0) {
@@ -143,7 +151,7 @@ static int tegra_dc_pin(struct tegra_dc *dc, struct tegra_plane_state *state)
                         state->iova[i] = sg_dma_address(sgt->sgl);
                         state->sgt[i] = sgt;
                 } else {
-                        state->iova[i] = bo->iova;
+                        state->iova[i] = phys_addr;
                 }
         }
 
@@ -156,9 +164,11 @@ unpin:
                 struct tegra_bo *bo = tegra_fb_get_plane(state->base.fb, i);
                 struct sg_table *sgt = state->sgt[i];
 
-                dma_unmap_sg(dc->dev, sgt->sgl, sgt->nents, DMA_TO_DEVICE);
-                host1x_bo_unpin(dc->dev, &bo->base, sgt);
+                if (sgt)
+                        dma_unmap_sg(dc->dev, sgt->sgl, sgt->nents,
+                                     DMA_TO_DEVICE);
 
+                host1x_bo_unpin(dc->dev, &bo->base, sgt);
                 state->iova[i] = DMA_MAPPING_ERROR;
                 state->sgt[i] = NULL;
         }
@@ -172,17 +182,13 @@ static void tegra_dc_unpin(struct tegra_dc *dc, struct tegra_plane_state *state)
 
         for (i = 0; i < state->base.fb->format->num_planes; i++) {
                 struct tegra_bo *bo = tegra_fb_get_plane(state->base.fb, i);
+                struct sg_table *sgt = state->sgt[i];
 
-                if (!dc->client.group) {
-                        struct sg_table *sgt = state->sgt[i];
-
-                        if (sgt) {
-                                dma_unmap_sg(dc->dev, sgt->sgl, sgt->nents,
-                                             DMA_TO_DEVICE);
-                                host1x_bo_unpin(dc->dev, &bo->base, sgt);
-                        }
-                }
+                if (sgt)
+                        dma_unmap_sg(dc->dev, sgt->sgl, sgt->nents,
+                                     DMA_TO_DEVICE);
 
+                host1x_bo_unpin(dc->dev, &bo->base, sgt);
                 state->iova[i] = DMA_MAPPING_ERROR;
                 state->sgt[i] = NULL;
         }
diff --git a/drivers/gpu/host1x/job.c b/drivers/gpu/host1x/job.c
index 25ca54de8fc5..0d53c08e9972 100644
--- a/drivers/gpu/host1x/job.c
+++ b/drivers/gpu/host1x/job.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -101,9 +102,11 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
 {
         struct host1x_client *client = job->client;
         struct device *dev = client->dev;
+        struct iommu_domain *domain;
         unsigned int i;
         int err;
 
+        domain = iommu_get_domain_for_dev(dev);
         job->num_unpins = 0;
 
         for (i = 0; i < job->num_relocs; i++) {
@@ -117,7 +120,19 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
                         goto unpin;
                 }
 
-                if (client->group)
+                /*
+                 * If the client device is not attached to an IOMMU, the
+                 * physical address of the buffer object can be used.
+                 *
+                 * Similarly, when an IOMMU domain is shared between all
+                 * host1x clients, the IOVA is already available, so no
+                 * need to map the buffer object again.
+                 *
+                 * XXX Note that this isn't always safe to do because it
+                 * relies on an assumption that no cache maintenance is
+                 * needed on the buffer objects.
+                 */
+                if (!domain || client->group)
                         phys = &phys_addr;
                 else
                         phys = NULL;
@@ -176,6 +191,7 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
                 dma_addr_t phys_addr;
                 unsigned long shift;
                 struct iova *alloc;
+                dma_addr_t *phys;
                 unsigned int j;
 
                 g->bo = host1x_bo_get(g->bo);
                 if (!g->bo) {
                         err = -ENOMEM;
                         goto unpin;
                 }
 
-                sgt = host1x_bo_pin(host->dev, g->bo, NULL);
+                /**
+                 * If the host1x is not attached to an IOMMU, there is no need
+                 * to map the buffer object for the host1x, since the physical
+                 * address can simply be used.
+                 */
+                if (!iommu_get_domain_for_dev(host->dev))
+                        phys = &phys_addr;
+                else
+                        phys = NULL;
+
+                sgt = host1x_bo_pin(host->dev, g->bo, phys);
                 if (IS_ERR(sgt)) {
                         err = PTR_ERR(sgt);
                         goto unpin;
                 }
@@ -214,7 +240,7 @@ static unsigned int pin_job(struct host1x *host, struct host1x_job *job)
 
                         job->unpins[job->num_unpins].size = gather_size;
                         phys_addr = iova_dma_addr(&host->iova, alloc);
-                } else {
+                } else if (sgt) {
                         err = dma_map_sg(host->dev, sgt->sgl, sgt->nents,
                                          DMA_TO_DEVICE);
                         if (!err) {
--
cgit v1.2.3-59-g8ed1b
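
Note on the pin-path selection introduced by the last two patches: every host1x
client now asks host1x_bo_pin() for a direct physical/IOVA address (and skips
dma_map_sg()) whenever the device either has no IOMMU domain at all or shares
the host1x IOMMU domain; only a device with its own separate domain still maps
the buffer itself. The short standalone C program below is only an illustration
of that decision table, not part of the series; the struct and field names are
hypothetical stand-ins for the driver state consulted by tegra_dc_pin() and
pin_job().

/*
 * Illustrative userspace model of the pin-path selection above.
 * Not kernel code: "struct pin_client" and its fields are hypothetical
 * stand-ins for iommu_get_domain_for_dev() and client->group checks.
 */
#include <stdbool.h>
#include <stdio.h>

struct pin_client {
	bool has_iommu_domain;    /* device is attached to an IOMMU domain */
	bool shares_host1x_group; /* domain is shared with the host1x (client->group) */
};

/*
 * Returns true when the caller should take the physical/IOVA address
 * straight from host1x_bo_pin() and skip the dma_map_sg() step,
 * mirroring the "!domain || client->group" condition in the patches.
 */
static bool wants_direct_address(const struct pin_client *c)
{
	return !c->has_iommu_domain || c->shares_host1x_group;
}

int main(void)
{
	const struct pin_client cases[] = {
		{ false, false }, /* no IOMMU: use the physical address */
		{ true,  false }, /* own domain: map via dma_map_sg() */
		{ true,  true  }, /* shared domain: reuse the existing IOVA */
	};

	for (unsigned int i = 0; i < sizeof(cases) / sizeof(cases[0]); i++)
		printf("case %u: %s\n", i,
		       wants_direct_address(&cases[i]) ?
		       "reuse existing mapping" : "dma_map_sg()");

	return 0;
}

Compiled with any C compiler, it prints which path each combination takes,
matching the checks added in plane.c and job.c above.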