From b6485bed40d7859735bdbfedbd55dcc8366a88a7 Mon Sep 17 00:00:00 2001 From: Tao Zhou Date: Mon, 6 Dec 2021 15:54:54 +0800 Subject: drm/amdkfd: reset queue which consumes RAS poison (v2) CP supports unmap queue with reset mode which only destroys specific queue without affecting others. Replacing whole gpu reset with reset queue mode for RAS poison consumption saves much time, and we can also fallback to gpu reset solution if reset queue fails. v2: Return directly if process is NULL; Reset queue solution is not applicable to SDMA, fallback to legacy way; Call kfd_unref_process after lookup process. Signed-off-by: Tao Zhou Reviewed-by: Hawking Zhang Acked-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 46cf48b3904a..0bf09a94d944 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -721,13 +721,13 @@ bool amdgpu_amdkfd_have_atomics_support(struct amdgpu_device *adev) return adev->have_atomics_support; } -void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev) +void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev, bool reset) { struct ras_err_data err_data = {0, 0, 0, NULL}; /* CPU MCA will handle page retirement if connected_to_cpu is 1 */ if (!adev->gmc.xgmi.connected_to_cpu) - amdgpu_umc_process_ras_data_cb(adev, &err_data, NULL); - else + amdgpu_umc_do_page_retirement(adev, &err_data, NULL, reset); + else if (reset) amdgpu_amdkfd_gpu_reset(adev); } -- cgit v1.2.3-59-g8ed1b