drm/amdgpu: Avoid HW GPU reset for RAS.

Problem: Under certain conditions, when some IP bocks take a RAS error, we can get into a situation where a GPU reset is not possible due to issues in RAS in SMU/PSP. Temporary fix until proper solution in PSP/SMU is ready: When uncorrectable error happens the DF will unconditionally broadcast error event packets to all its clients/slave upon receiving fatal error event and freeze all its outbound queues, err_event_athub interrupt will be triggered. In such case and we use this interrupt to issue GPU reset. THe GPU reset code is modified for such case to avoid HW reset, only stops schedulers, deatches all in progress and not yet scheduled job's fences, set error code on them and signals. Also reject any new incoming job submissions from user space. All this is done to notify the applications of the problem. v2: Extract amdgpu_amdkfd_pre/post_reset from amdgpu_device_lock/unlock_adev Move amdgpu_job_stop_all_jobs_on_sched to amdgpu_job.c Remove print param from amdgpu_ras_query_error_count v3: Update based on prevoius bug fixing patch to properly call amdgpu_amdkfd_pre_reset for other XGMI hive memebers. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
author: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 2019-09-13 17:40:32 -0500
committer: Alex Deucher <alexander.deucher@amd.com> 2019-09-13 17:41:05 -0500
commit: 7c6e68c777f109484559a35b125a773439bbd319 (patch)
tree: aca19eee2211f8b1684a9ef789a53fe5c8ba1982 /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parent: drm/amdgpu: Fix bugs in amdgpu_device_gpu_recover in XGMI case. (diff)
download: linux-dev-7c6e68c777f109484559a35b125a773439bbd319.tar.xz
linux-dev-7c6e68c777f109484559a35b125a773439bbd319.zip
1 files changed, 38 insertions, 0 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 9d76e0923a5a..e1bad992e83b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -246,6 +246,44 @@ static struct dma_fence *amdgpu_job_run(struct drm_sched_job *sched_job)
 	return fence;
 }
 
+#define to_drm_sched_job(sched_job)		\
+		container_of((sched_job), struct drm_sched_job, queue_node)
+
+void amdgpu_job_stop_all_jobs_on_sched(struct drm_gpu_scheduler *sched)
+{
+	struct drm_sched_job *s_job;
+	struct drm_sched_entity *s_entity = NULL;
+	int i;
+
+	/* Signal all jobs not yet scheduled */
+	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
+		struct drm_sched_rq *rq = &sched->sched_rq[i];
+
+		if (!rq)
+			continue;
+
+		spin_lock(&rq->lock);
+		list_for_each_entry(s_entity, &rq->entities, list) {
+			while ((s_job = to_drm_sched_job(spsc_queue_pop(&s_entity->job_queue)))) {
+				struct drm_sched_fence *s_fence = s_job->s_fence;
+
+				dma_fence_signal(&s_fence->scheduled);
+				dma_fence_set_error(&s_fence->finished, -EHWPOISON);
+				dma_fence_signal(&s_fence->finished);
+			}
+		}
+		spin_unlock(&rq->lock);
+	}
+
+	/* Signal all jobs already scheduled to HW */
+	list_for_each_entry(s_job, &sched->ring_mirror_list, node) {
+		struct drm_sched_fence *s_fence = s_job->s_fence;
+
+		dma_fence_set_error(&s_fence->finished, -EHWPOISON);
+		dma_fence_signal(&s_fence->finished);
+	}
+}
+
 const struct drm_sched_backend_ops amdgpu_sched_ops = {
 	.dependency = amdgpu_job_dependency,
 	.run_job = amdgpu_job_run,
author	Andrey Grodzovsky <andrey.grodzovsky@amd.com>	2019-09-13 17:40:32 -0500
committer	Alex Deucher <alexander.deucher@amd.com>	2019-09-13 17:41:05 -0500
commit	7c6e68c777f109484559a35b125a773439bbd319 (patch)
tree	aca19eee2211f8b1684a9ef789a53fe5c8ba1982 /drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
parent	drm/amdgpu: Fix bugs in amdgpu_device_gpu_recover in XGMI case. (diff)
download	linux-dev-7c6e68c777f109484559a35b125a773439bbd319.tar.xz linux-dev-7c6e68c777f109484559a35b125a773439bbd319.zip