diff options
author | Oak Zeng <Oak.Zeng@amd.com> | 2021-07-15 18:34:25 -0500 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2021-07-23 10:08:00 -0400 |
commit | 4f942aaeb19dbf2135931120cc806d459add4788 (patch) | |
tree | 4b40dddc9e1921155700d94e32052066ad81cfb9 /drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | |
parent | drm/amdkfd: Set priv_queue to NULL after it is freed (diff) | |
download | wireguard-linux-4f942aaeb19dbf2135931120cc806d459add4788.tar.xz wireguard-linux-4f942aaeb19dbf2135931120cc806d459add4788.zip |
drm/amdkfd: Fix a concurrency issue during kfd recovery
start_cpsch and stop_cpsch can be called during kfd device
initialization or during gpu reset/recovery. So they can
run concurrently. Currently in start_cpsch and stop_cpsch,
pm_init and pm_uninit is not protected by the dpm lock.
Imagine such a case that user use packet manager's function
to submit a pm4 packet to hang hws (ie through command
cat /sys/class/kfd/kfd/topology/nodes/1/gpu_id | sudo tee
/sys/kernel/debug/kfd/hang_hws), while kfd device is under
device reset/recovery so packet manager can be not initialized.
There will be unpredictable protection fault in such case.
This patch moves pm_init/uninit inside the dpm lock and check
packet manager is initialized before using packet manager
function.
Signed-off-by: Oak Zeng <Oak.Zeng@amd.com>
Acked-by: Christian Konig <christian.koenig@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 12 |
1 files changed, 9 insertions, 3 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index 6b2f5940c1f6..6b89ca6ddc65 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -1164,6 +1164,7 @@ static int start_cpsch(struct device_queue_manager *dqm) retval = 0; + dqm_lock(dqm); retval = pm_init(&dqm->packet_mgr, dqm); if (retval) goto fail_packet_manager_init; @@ -1186,7 +1187,6 @@ static int start_cpsch(struct device_queue_manager *dqm) init_interrupts(dqm); - dqm_lock(dqm); /* clear hang status when driver try to start the hw scheduler */ dqm->is_hws_hang = false; dqm->is_resetting = false; @@ -1199,6 +1199,7 @@ fail_allocate_vidmem: fail_set_sched_resources: pm_uninit(&dqm->packet_mgr, false); fail_packet_manager_init: + dqm_unlock(dqm); return retval; } @@ -1211,12 +1212,12 @@ static int stop_cpsch(struct device_queue_manager *dqm) unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); hanging = dqm->is_hws_hang || dqm->is_resetting; dqm->sched_running = false; - dqm_unlock(dqm); pm_release_ib(&dqm->packet_mgr); kfd_gtt_sa_free(dqm->dev, dqm->fence_mem); pm_uninit(&dqm->packet_mgr, hanging); + dqm_unlock(dqm); return 0; } @@ -2099,11 +2100,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data) return r; } -int dqm_debugfs_execute_queues(struct device_queue_manager *dqm) +int dqm_debugfs_hang_hws(struct device_queue_manager *dqm) { int r = 0; dqm_lock(dqm); + r = pm_debugfs_hang_hws(&dqm->packet_mgr); + if (r) { + dqm_unlock(dqm); + return r; + } dqm->active_runlist = true; r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0); dqm_unlock(dqm); |