aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYiPeng Chai <YiPeng.Chai@amd.com>2022-09-08 09:44:36 +0800
committerAlex Deucher <alexander.deucher@amd.com>2022-09-19 15:17:47 -0400
commit83d29a5f8a5a8ac76fdf8b8ccca65899345e6a9e (patch)
treeaaaa3c7b526854b3a0793dbedd710dfc87c7bdcf
parentdrm/amdgpu: Adjust removal control flow for smu v13_0_2 (diff)
downloadlinux-dev-83d29a5f8a5a8ac76fdf8b8ccca65899345e6a9e.tar.xz
linux-dev-83d29a5f8a5a8ac76fdf8b8ccca65899345e6a9e.zip
drm/amdgpu: Fixed psp fence and memory issues when removing amdgpu device
V3: Fixed psp fence and memory issues for the asic using smu v13_0_2 when removing amdgpu device. [Why]: 1. Call chain: psp_suspend-> psp_free_shared_bufs-> psp_ta_free_shared_buf-> amdgpu_bo_free_kernel-> ...-> amdgpu_bo_release_notify-> amdgpu_fill_buffer. psp will free vram memory used by psp when psp_suspend is called. But for the asic using smu v13_0_2, because psp_suspend is called before adev->shutdown is set to true when removing the first hive device, amdgpu_fill_buffer will be called, which will cause fence issues when evicting all vram resources in amdgpu_vram_mgr_fini. 2. Since psp_hw_fini is not called after calling psp_suspend and psp_suspend only calls psp_ring_stop, the psp ring memory will not be released when amdgpu device is removed. [How]: 1. Set shutdown to true before calling amdgpu_device_gpu_recover, so that amdgpu_fill_buffer will not be called when psp_suspend is called. 2. Free psp ring memory in psp_sw_fini. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_device.c5
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c5
3 files changed, 10 insertions, 1 deletion
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index c268bd033064..869c843c1d58 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5191,8 +5191,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
*/
INIT_LIST_HEAD(&device_list);
if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
- list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
+ list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
list_add_tail(&tmp_adev->reset_list, &device_list);
+ if (gpu_reset_for_dev_remove && adev->shutdown)
+ tmp_adev->shutdown = true;
+ }
if (!list_is_first(&adev->reset_list, &device_list))
list_rotate_to_front(&adev->reset_list, &device_list);
device_list_handle = &device_list;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index 2e16210bebaf..81b22c1bd8df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2207,6 +2207,7 @@ amdgpu_pci_remove(struct pci_dev *pdev)
if (need_to_reset_gpu) {
struct amdgpu_reset_context reset_context;
+ adev->shutdown = true;
memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index c4848522be16..effa7df3ddbf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -511,6 +511,11 @@ static int psp_sw_fini(void *handle)
kfree(cmd);
cmd = NULL;
+ if (psp->km_ring.ring_mem)
+ amdgpu_bo_free_kernel(&adev->firmware.rbuf,
+ &psp->km_ring.ring_mem_mc_addr,
+ (void **)&psp->km_ring.ring_mem);
+
amdgpu_bo_free_kernel(&psp->fw_pri_bo,
&psp->fw_pri_mc_addr, &psp->fw_pri_buf);
amdgpu_bo_free_kernel(&psp->fence_buf_bo,