aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdgpu
diff options
context:
space:
mode:
authorRajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>2021-01-11 13:27:50 -0500
committerAlex Deucher <alexander.deucher@amd.com>2022-02-07 17:59:41 -0500
commit011bbb03024f5a22dc04eba370f9296f0cb83502 (patch)
treee5427c42e03acdc91872818eae0a00d5d488032f /drivers/gpu/drm/amd/amdgpu
parentdrm/amdkfd: CRIU Implement KFD restore ioctl (diff)
downloadlinux-dev-011bbb03024f5a22dc04eba370f9296f0cb83502.tar.xz
linux-dev-011bbb03024f5a22dc04eba370f9296f0cb83502.zip
drm/amdkfd: CRIU Implement KFD resume ioctl
This adds support to create userptr BOs on restore and introduces a new ioctl op to restart memory notifiers for the restored userptr BOs. When doing CRIU restore MMU notifications can happen anytime after we call amdgpu_mn_register. Prevent MMU notifications until we reach stage-4 of the restore process i.e. criu_resume ioctl op is received, and the process is ready to be resumed. This ioctl is different from other KFD CRIU ioctls since its called by CRIU master restore process for all the target processes being resumed by CRIU. Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Signed-off-by: David Yat Sin <david.yatsin@amd.com> Signed-off-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h6
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c53
2 files changed, 55 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 395ba9566afe..4cb14c2fe53f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -131,6 +131,7 @@ struct amdkfd_process_info {
atomic_t evicted_bos;
struct delayed_work restore_userptr_work;
struct pid *pid;
+ bool block_mmu_notifications;
};
int amdgpu_amdkfd_init(void);
@@ -268,7 +269,7 @@ uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv);
int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
struct amdgpu_device *adev, uint64_t va, uint64_t size,
void *drm_priv, struct kgd_mem **mem,
- uint64_t *offset, uint32_t flags);
+ uint64_t *offset, uint32_t flags, bool criu_resume);
int amdgpu_amdkfd_gpuvm_free_memory_of_gpu(
struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv,
uint64_t *size);
@@ -298,6 +299,9 @@ int amdgpu_amdkfd_get_tile_config(struct amdgpu_device *adev,
void amdgpu_amdkfd_ras_poison_consumption_handler(struct amdgpu_device *adev,
bool reset);
bool amdgpu_amdkfd_bo_mapped_to_dev(struct amdgpu_device *adev, struct kgd_mem *mem);
+void amdgpu_amdkfd_block_mmu_notifications(void *p);
+int amdgpu_amdkfd_criu_resume(void *p);
+
#if IS_ENABLED(CONFIG_HSA_AMD)
void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5cf4bedca1d6..2e00c3fb4bd3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -842,7 +842,8 @@ static void remove_kgd_mem_from_kfd_bo_list(struct kgd_mem *mem,
*
* Returns 0 for success, negative errno for errors.
*/
-static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
+static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr,
+ bool criu_resume)
{
struct amdkfd_process_info *process_info = mem->process_info;
struct amdgpu_bo *bo = mem->bo;
@@ -864,6 +865,18 @@ static int init_user_pages(struct kgd_mem *mem, uint64_t user_addr)
goto out;
}
+ if (criu_resume) {
+ /*
+ * During a CRIU restore operation, the userptr buffer objects
+ * will be validated in the restore_userptr_work worker at a
+ * later stage when it is scheduled by another ioctl called by
+ * CRIU master process for the target pid for restore.
+ */
+ atomic_inc(&mem->invalid);
+ mutex_unlock(&process_info->lock);
+ return 0;
+ }
+
ret = amdgpu_ttm_tt_get_user_pages(bo, bo->tbo.ttm->pages);
if (ret) {
pr_err("%s: Failed to get user pages: %d\n", __func__, ret);
@@ -1452,10 +1465,39 @@ uint64_t amdgpu_amdkfd_gpuvm_get_process_page_dir(void *drm_priv)
return avm->pd_phys_addr;
}
+void amdgpu_amdkfd_block_mmu_notifications(void *p)
+{
+ struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
+
+ mutex_lock(&pinfo->lock);
+ WRITE_ONCE(pinfo->block_mmu_notifications, true);
+ mutex_unlock(&pinfo->lock);
+}
+
+int amdgpu_amdkfd_criu_resume(void *p)
+{
+ int ret = 0;
+ struct amdkfd_process_info *pinfo = (struct amdkfd_process_info *)p;
+
+ mutex_lock(&pinfo->lock);
+ pr_debug("scheduling work\n");
+ atomic_inc(&pinfo->evicted_bos);
+ if (!READ_ONCE(pinfo->block_mmu_notifications)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ WRITE_ONCE(pinfo->block_mmu_notifications, false);
+ schedule_delayed_work(&pinfo->restore_userptr_work, 0);
+
+out_unlock:
+ mutex_unlock(&pinfo->lock);
+ return ret;
+}
+
int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
struct amdgpu_device *adev, uint64_t va, uint64_t size,
void *drm_priv, struct kgd_mem **mem,
- uint64_t *offset, uint32_t flags)
+ uint64_t *offset, uint32_t flags, bool criu_resume)
{
struct amdgpu_vm *avm = drm_priv_to_vm(drm_priv);
enum ttm_bo_type bo_type = ttm_bo_type_device;
@@ -1558,7 +1600,8 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
add_kgd_mem_to_kfd_bo_list(*mem, avm->process_info, user_addr);
if (user_addr) {
- ret = init_user_pages(*mem, user_addr);
+ pr_debug("creating userptr BO for user_addr = %llu\n", user_addr);
+ ret = init_user_pages(*mem, user_addr, criu_resume);
if (ret)
goto allocate_init_user_pages_failed;
} else if (flags & (KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL |
@@ -2062,6 +2105,10 @@ int amdgpu_amdkfd_evict_userptr(struct kgd_mem *mem,
int evicted_bos;
int r = 0;
+ /* Do not process MMU notifications until stage-4 IOCTL is received */
+ if (READ_ONCE(process_info->block_mmu_notifications))
+ return 0;
+
atomic_inc(&mem->invalid);
evicted_bos = atomic_inc_return(&process_info->evicted_bos);
if (evicted_bos == 1) {