diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 31 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm | 27 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 88 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 109 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device.c | 8 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c | 47 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c | 12 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_events.c | 3 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c | 6 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 78 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c | 92 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process.c | 5 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 7 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 93 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_svm.h | 3 |
16 files changed, 394 insertions, 217 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index 60a81649cf12..0c4c5499bb5c 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -742,7 +742,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbf88fffe, 0x877aff7f, 0x04000000, 0x8f7a857a, 0x886d7a6d, 0xb97b02dc, - 0x8f7b997b, 0xb97a2a05, + 0x8f7b997b, 0xb97a3a05, 0x807a817a, 0xbf0d997b, 0xbf850002, 0x8f7a897a, 0xbf820001, 0x8f7a8a7a, @@ -819,7 +819,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xbefe037c, 0xbefc0370, 0xf4611c7a, 0xf8000000, 0x80708470, 0xbefc037e, - 0xb9702a05, 0x80708170, + 0xb9703a05, 0x80708170, 0xbf0d9973, 0xbf850002, 0x8f708970, 0xbf820001, 0x8f708a70, 0xb97a1e06, @@ -1069,7 +1069,7 @@ static const uint32_t cwsr_trap_nv1x_hex[] = { 0xb9f9f816, 0x876f7bff, 0xfffff800, 0x906f8b6f, 0xb9efa2c3, 0xb9f3f801, - 0xb96e2a05, 0x806e816e, + 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, 0xbf820001, 0x8f6e8a6e, 0xb96f1e06, @@ -2114,7 +2114,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x007a0000, 0x7e000280, 0xbefe037a, 0xbeff037b, 0xb97b02dc, 0x8f7b997b, - 0xb97a2a05, 0x807a817a, + 0xb97a3a05, 0x807a817a, 0xbf0d997b, 0xbf850002, 0x8f7a897a, 0xbf820001, 0x8f7a8a7a, 0xb97b1e06, @@ -2157,7 +2157,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x01000000, 0xe0704100, 0x705d0100, 0xe0704200, 0x705d0200, 0xe0704300, - 0x705d0300, 0xb9702a05, + 0x705d0300, 0xb9703a05, 0x80708170, 0xbf0d9973, 0xbf850002, 0x8f708970, 0xbf820001, 0x8f708a70, @@ -2189,7 +2189,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbefe03ff, 0x0000ffff, 0xbeff0380, 0xe0704000, 0x705d0200, 0xbefe03c1, - 0xb9702a05, 0x80708170, + 0xb9703a05, 0x80708170, 0xbf0d9973, 0xbf850002, 0x8f708970, 0xbf820001, 0x8f708a70, 0xb97a1e06, @@ -2475,7 +2475,7 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xb9ef4803, 0x876f7bff, 0xfffff800, 0x906f8b6f, 0xb9efa2c3, 0xb9f3f801, - 0xb96e2a05, 0x806e816e, + 0xb96e3a05, 0x806e816e, 0xbf0d9972, 0xbf850002, 0x8f6e896e, 0xbf820001, 0x8f6e8a6e, 0xb96f1e06, @@ -2494,11 +2494,13 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0x00000000, }; - static const uint32_t cwsr_trap_gfx11_hex[] = { - 0xbfa00001, 0xbfa0021b, + 0xbfa00001, 0xbfa00221, 0xb0804006, 0xb8f8f802, - 0x91788678, 0xb8fbf803, + 0x9178ff78, 0x00020006, + 0xb8fbf803, 0xbf0d9e6d, + 0xbfa10001, 0xbfbd0000, + 0xbf0d9f6d, 0xbfa20006, 0x8b6eff78, 0x00002000, 0xbfa10009, 0x8b6eff6d, 0x00ff0000, 0xbfa2001e, @@ -2766,7 +2768,7 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0x701d0000, 0x807d817d, 0x8070ff70, 0x00000080, 0xbf0a7b7d, 0xbfa2fff8, - 0xbfa00141, 0xbef4007e, + 0xbfa00146, 0xbef4007e, 0x8b75ff7f, 0x0000ffff, 0x8c75ff75, 0x00040000, 0xbef60080, 0xbef700ff, @@ -2926,8 +2928,11 @@ static const uint32_t cwsr_trap_gfx11_hex[] = { 0xf8000074, 0xbf89fc07, 0x8b6dff6d, 0x0000ffff, 0x8bfe7e7e, 0x8bea6a6a, - 0xb97af802, 0xbe804a6c, - 0xbfb00000, 0xbf9f0000, + 0xb8eef802, 0xbf0d866e, + 0xbfa20002, 0xb97af802, + 0xbe80486c, 0xb97af802, + 0xbe804a6c, 0xbfb00000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, }; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 250ab007399b..8b92c33c2a7c 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -43,12 +43,14 @@ #define HAVE_XNACK (ASIC_FAMILY < CHIP_SIENNA_CICHLID) #define HAVE_SENDMSG_RTN (ASIC_FAMILY >= CHIP_PLUM_BONITO) #define HAVE_BUFFER_LDS_LOAD (ASIC_FAMILY < CHIP_PLUM_BONITO) +#define SW_SA_TRAP (ASIC_FAMILY >= CHIP_PLUM_BONITO) var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 var SQ_WAVE_STATUS_HALT_MASK = 0x2000 var SQ_WAVE_STATUS_ECC_ERR_MASK = 0x20000 +var SQ_WAVE_STATUS_TRAP_EN_SHIFT = 6 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 @@ -183,6 +185,19 @@ L_SKIP_RESTORE: s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) +#if SW_SA_TRAP + // If ttmp1[30] is set then issue s_barrier to unblock dependent waves. + s_bitcmp1_b32 s_save_pc_hi, 30 + s_cbranch_scc0 L_TRAP_NO_BARRIER + s_barrier + +L_TRAP_NO_BARRIER: + // If ttmp1[31] is set then trap may occur early. + // Spin wait until SAVECTX exception is raised. + s_bitcmp1_b32 s_save_pc_hi, 31 + s_cbranch_scc1 L_CHECK_SAVE +#endif + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK s_cbranch_scc0 L_NOT_HALTED @@ -1061,8 +1076,20 @@ L_RESTORE_HWREG: s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 + +#if SW_SA_TRAP + // If traps are enabled then return to the shader with PRIV=0. + // Otherwise retain PRIV=1 for subsequent context save requests. + s_getreg_b32 s_restore_tmp, hwreg(HW_REG_STATUS) + s_bitcmp1_b32 s_restore_tmp, SQ_WAVE_STATUS_TRAP_EN_SHIFT + s_cbranch_scc1 L_RETURN_WITHOUT_PRIV + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu + s_setpc_b64 [s_restore_pc_lo, s_restore_pc_hi] +L_RETURN_WITHOUT_PRIV: +#endif + s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution L_END_PGM: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index dc774ddf3445..6d291aa6386b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -327,6 +327,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, goto err_bind_process; } + if (!pdd->doorbell_index && + kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) { + err = -ENOMEM; + goto err_alloc_doorbells; + } + /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) */ @@ -404,6 +410,7 @@ err_create_queue: if (wptr_bo) amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo); err_wptr_map_gart: +err_alloc_doorbells: err_bind_process: err_pdd: mutex_unlock(&p->mutex); @@ -869,14 +876,11 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_wait_events_args *args = data; - int err; - err = kfd_wait_on_events(p, args->num_events, + return kfd_wait_on_events(p, args->num_events, (void __user *)args->events_ptr, (args->wait_for_all != 0), &args->timeout, &args->wait_result); - - return err; } static int kfd_ioctl_set_scratch_backing_va(struct file *filep, struct kfd_process *p, void *data) @@ -1092,6 +1096,10 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, goto err_unlock; } offset = kfd_get_process_doorbells(pdd); + if (!offset) { + err = -ENOMEM; + goto err_unlock; + } } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { if (args->size != PAGE_SIZE) { err = -EINVAL; @@ -1576,6 +1584,8 @@ static int kfd_ioctl_smi_events(struct file *filep, return kfd_smi_event_open(pdd->dev, &args->anon_fd); } +#if IS_ENABLED(CONFIG_HSA_AMD_SVM) + static int kfd_ioctl_set_xnack_mode(struct file *filep, struct kfd_process *p, void *data) { @@ -1586,22 +1596,29 @@ static int kfd_ioctl_set_xnack_mode(struct file *filep, if (args->xnack_enabled >= 0) { if (!list_empty(&p->pqm.queues)) { pr_debug("Process has user queues running\n"); - mutex_unlock(&p->mutex); - return -EBUSY; + r = -EBUSY; + goto out_unlock; } - if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) + + if (p->xnack_enabled == args->xnack_enabled) + goto out_unlock; + + if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) { r = -EPERM; - else - p->xnack_enabled = args->xnack_enabled; + goto out_unlock; + } + + r = svm_range_switch_xnack_reserve_mem(p, args->xnack_enabled); } else { args->xnack_enabled = p->xnack_enabled; } + +out_unlock: mutex_unlock(&p->mutex); return r; } -#if IS_ENABLED(CONFIG_HSA_AMD_SVM) static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_svm_args *args = data; @@ -1621,6 +1638,11 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) return r; } #else +static int kfd_ioctl_set_xnack_mode(struct file *filep, + struct kfd_process *p, void *data) +{ + return -EPERM; +} static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) { return -EPERM; @@ -1928,7 +1950,7 @@ static int criu_checkpoint(struct file *filep, { int ret; uint32_t num_devices, num_bos, num_objects; - uint64_t priv_size, priv_offset = 0; + uint64_t priv_size, priv_offset = 0, bo_priv_offset; if (!args->devices || !args->bos || !args->priv_data) return -EINVAL; @@ -1972,38 +1994,34 @@ static int criu_checkpoint(struct file *filep, if (ret) goto exit_unlock; - ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos, - (uint8_t __user *)args->priv_data, &priv_offset); - if (ret) - goto exit_unlock; + /* Leave room for BOs in the private data. They need to be restored + * before events, but we checkpoint them last to simplify the error + * handling. + */ + bo_priv_offset = priv_offset; + priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data); if (num_objects) { ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data, &priv_offset); if (ret) - goto close_bo_fds; + goto exit_unlock; ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data, &priv_offset); if (ret) - goto close_bo_fds; + goto exit_unlock; ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset); if (ret) - goto close_bo_fds; + goto exit_unlock; } -close_bo_fds: - if (ret) { - /* If IOCTL returns err, user assumes all FDs opened in criu_dump_bos are closed */ - uint32_t i; - struct kfd_criu_bo_bucket *bo_buckets = (struct kfd_criu_bo_bucket *) args->bos; - - for (i = 0; i < num_bos; i++) { - if (bo_buckets[i].alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) - close_fd(bo_buckets[i].dmabuf_fd); - } - } + /* This must be the last thing in this function that can fail. + * Otherwise we leak dmabuf file descriptors. + */ + ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos, + (uint8_t __user *)args->priv_data, &bo_priv_offset); exit_unlock: mutex_unlock(&p->mutex); @@ -2145,6 +2163,12 @@ static int criu_restore_devices(struct kfd_process *p, ret = PTR_ERR(pdd); goto exit; } + + if (!pdd->doorbell_index && + kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) { + ret = -ENOMEM; + goto exit; + } } /* @@ -2173,6 +2197,8 @@ static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, return -EINVAL; offset = kfd_get_process_doorbells(pdd); + if (!offset) + return -ENOMEM; } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { /* MMIO BOs need remapped bus address */ if (bo_bucket->size != PAGE_SIZE) { @@ -2847,7 +2873,6 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, struct vm_area_struct *vma) { phys_addr_t address; - int ret; if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; @@ -2867,12 +2892,11 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, process->pasid, (unsigned long long) vma->vm_start, address, vma->vm_flags, PAGE_SIZE); - ret = io_remap_pfn_range(vma, + return io_remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); - return ret; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index a5409531a2fd..8bfdfd062ff6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -795,6 +795,102 @@ static struct kfd_gpu_cache_info yellow_carp_cache_info[] = { }, }; +static struct kfd_gpu_cache_info gfx1037_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache per SQC */ + .cache_size = 32, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache per SQC */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* GL1 Data Cache per SA */ + .cache_size = 128, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* L2 Data Cache per GPU (Total Tex Cache) */ + .cache_size = 256, + .cache_level = 2, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, +}; + +static struct kfd_gpu_cache_info gc_10_3_6_cache_info[] = { + { + /* TCP L1 Cache per CU */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 1, + }, + { + /* Scalar L1 Instruction Cache per SQC */ + .cache_size = 32, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_INST_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* Scalar L1 Data Cache per SQC */ + .cache_size = 16, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* GL1 Data Cache per SA */ + .cache_size = 128, + .cache_level = 1, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, + { + /* L2 Data Cache per GPU (Total Tex Cache) */ + .cache_size = 256, + .cache_level = 2, + .flags = (CRAT_CACHE_FLAGS_ENABLED | + CRAT_CACHE_FLAGS_DATA_CACHE | + CRAT_CACHE_FLAGS_SIMD_CACHE), + .num_cu_shared = 2, + }, +}; + static void kfd_populated_cu_info_cpu(struct kfd_topology_device *dev, struct crat_subtype_computeunit *cu) { @@ -1514,14 +1610,21 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, num_of_cache_types = ARRAY_SIZE(beige_goby_cache_info); break; case IP_VERSION(10, 3, 3): - case IP_VERSION(10, 3, 6): /* TODO: Double check these on production silicon */ - case IP_VERSION(10, 3, 7): /* TODO: Double check these on production silicon */ pcache_info = yellow_carp_cache_info; num_of_cache_types = ARRAY_SIZE(yellow_carp_cache_info); break; + case IP_VERSION(10, 3, 6): + pcache_info = gc_10_3_6_cache_info; + num_of_cache_types = ARRAY_SIZE(gc_10_3_6_cache_info); + break; + case IP_VERSION(10, 3, 7): + pcache_info = gfx1037_cache_info; + num_of_cache_types = ARRAY_SIZE(gfx1037_cache_info); + break; case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 1): case IP_VERSION(11, 0, 2): + case IP_VERSION(11, 0, 3): pcache_info = cache_info; num_of_cache_types = kfd_fill_gpu_cache_info_from_gfx_config(kdev, pcache_info); @@ -2283,7 +2386,7 @@ static int kfd_create_vcrat_image_gpu(void *pcrat_image, /* Fill in Subtype: IO_LINKS * Only direct links are added here which is Link from GPU to - * to its NUMA node. Indirect links are added by userspace. + * its NUMA node. Indirect links are added by userspace. */ sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + cache_mem_filled); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 22c0929d410b..65a1d4f9004b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -91,6 +91,7 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) case IP_VERSION(6, 0, 0): case IP_VERSION(6, 0, 1): case IP_VERSION(6, 0, 2): + case IP_VERSION(6, 0, 3): kfd->device_info.num_sdma_queues_per_engine = 8; break; default: @@ -103,6 +104,7 @@ static void kfd_device_info_set_sdma_info(struct kfd_dev *kfd) switch (sdma_version) { case IP_VERSION(6, 0, 0): case IP_VERSION(6, 0, 2): + case IP_VERSION(6, 0, 3): /* Reserve 1 for paging and 1 for gfx */ kfd->device_info.num_reserved_sdma_queues_per_engine = 2; /* BIT(0)=engine-0 queue-0; BIT(1)=engine-1 queue-0; BIT(2)=engine-0 queue-1; ... */ @@ -150,6 +152,7 @@ static void kfd_device_info_set_event_interrupt_class(struct kfd_dev *kfd) case IP_VERSION(11, 0, 0): case IP_VERSION(11, 0, 1): case IP_VERSION(11, 0, 2): + case IP_VERSION(11, 0, 3): kfd->device_info.event_interrupt_class = &event_interrupt_class_v11; break; default: @@ -399,6 +402,11 @@ struct kfd_dev *kgd2kfd_probe(struct amdgpu_device *adev, bool vf) gfx_target_version = 110002; f2g = &gfx_v11_kfd2kgd; break; + case IP_VERSION(11, 0, 3): + /* Note: Compiler version is 11.0.1 while HW version is 11.0.3 */ + gfx_target_version = 110001; + f2g = &gfx_v11_kfd2kgd; + break; default: break; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e83725a28106..ecb4c3abc629 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -205,6 +205,8 @@ static int add_queue_mes(struct device_queue_manager *dqm, struct queue *q, } queue_input.is_kfd_process = 1; + queue_input.is_aql_queue = (q->properties.format == KFD_QUEUE_FORMAT_AQL); + queue_input.queue_size = q->properties.queue_size >> 2; queue_input.paging = false; queue_input.tba_addr = qpd->tba_addr; @@ -1240,6 +1242,24 @@ static void init_interrupts(struct device_queue_manager *dqm) dqm->dev->kfd2kgd->init_interrupts(dqm->dev->adev, i); } +static void init_sdma_bitmaps(struct device_queue_manager *dqm) +{ + unsigned int num_sdma_queues = + min_t(unsigned int, sizeof(dqm->sdma_bitmap)*8, + get_num_sdma_queues(dqm)); + unsigned int num_xgmi_sdma_queues = + min_t(unsigned int, sizeof(dqm->xgmi_sdma_bitmap)*8, + get_num_xgmi_sdma_queues(dqm)); + + if (num_sdma_queues) + dqm->sdma_bitmap = GENMASK_ULL(num_sdma_queues-1, 0); + if (num_xgmi_sdma_queues) + dqm->xgmi_sdma_bitmap = GENMASK_ULL(num_xgmi_sdma_queues-1, 0); + + dqm->sdma_bitmap &= ~get_reserved_sdma_queues_bitmap(dqm); + pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap); +} + static int initialize_nocpsch(struct device_queue_manager *dqm) { int pipe, queue; @@ -1268,11 +1288,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm) memset(dqm->vmid_pasid, 0, sizeof(dqm->vmid_pasid)); - dqm->sdma_bitmap = ~0ULL >> (64 - get_num_sdma_queues(dqm)); - dqm->sdma_bitmap &= ~(get_reserved_sdma_queues_bitmap(dqm)); - pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap); - - dqm->xgmi_sdma_bitmap = ~0ULL >> (64 - get_num_xgmi_sdma_queues(dqm)); + init_sdma_bitmaps(dqm); return 0; } @@ -1450,9 +1466,6 @@ static int set_sched_resources(struct device_queue_manager *dqm) static int initialize_cpsch(struct device_queue_manager *dqm) { - uint64_t num_sdma_queues; - uint64_t num_xgmi_sdma_queues; - pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm)); mutex_init(&dqm->lock_hidden); @@ -1461,24 +1474,10 @@ static int initialize_cpsch(struct device_queue_manager *dqm) dqm->active_cp_queue_count = 0; dqm->gws_queue_count = 0; dqm->active_runlist = false; - - num_sdma_queues = get_num_sdma_queues(dqm); - if (num_sdma_queues >= BITS_PER_TYPE(dqm->sdma_bitmap)) - dqm->sdma_bitmap = ULLONG_MAX; - else - dqm->sdma_bitmap = (BIT_ULL(num_sdma_queues) - 1); - - dqm->sdma_bitmap &= ~(get_reserved_sdma_queues_bitmap(dqm)); - pr_info("sdma_bitmap: %llx\n", dqm->sdma_bitmap); - - num_xgmi_sdma_queues = get_num_xgmi_sdma_queues(dqm); - if (num_xgmi_sdma_queues >= BITS_PER_TYPE(dqm->xgmi_sdma_bitmap)) - dqm->xgmi_sdma_bitmap = ULLONG_MAX; - else - dqm->xgmi_sdma_bitmap = (BIT_ULL(num_xgmi_sdma_queues) - 1); - INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception); + init_sdma_bitmaps(dqm); + return 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c index cb3d2ccc5100..cd4e61bf0493 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_doorbell.c @@ -157,6 +157,8 @@ int kfd_doorbell_mmap(struct kfd_dev *dev, struct kfd_process *process, /* Calculate physical address of doorbell */ address = kfd_get_process_doorbells(pdd); + if (!address) + return -ENOMEM; vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE | VM_DONTDUMP | VM_PFNMAP; @@ -275,6 +277,13 @@ uint64_t kfd_get_number_elems(struct kfd_dev *kfd) phys_addr_t kfd_get_process_doorbells(struct kfd_process_device *pdd) { + if (!pdd->doorbell_index) { + int r = kfd_alloc_process_doorbells(pdd->dev, + &pdd->doorbell_index); + if (r) + return 0; + } + return pdd->dev->doorbell_base + pdd->doorbell_index * kfd_doorbell_process_slice(pdd->dev); } @@ -294,6 +303,9 @@ int kfd_alloc_process_doorbells(struct kfd_dev *kfd, unsigned int *doorbell_inde if (r > 0) *doorbell_index = r; + if (r < 0) + pr_err("Failed to allocate process doorbells\n"); + return r; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index 83e3ce9f6049..729d26d648af 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -506,6 +506,7 @@ int kfd_criu_restore_event(struct file *devkfd, ret = create_other_event(p, ev, &ev_priv->event_id); break; } + mutex_unlock(&p->event_mutex); exit: if (ret) @@ -513,8 +514,6 @@ exit: kfree(ev_priv); - mutex_unlock(&p->event_mutex); - return ret; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c index a6fcbeeb7428..0d53f6067422 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v11.c @@ -350,11 +350,11 @@ static void event_interrupt_wq_v11(struct kfd_dev *dev, print_sq_intr_info_inst(context_id0, context_id1); sq_int_priv = REG_GET_FIELD(context_id0, SQ_INTERRUPT_WORD_WAVE_CTXID0, PRIV); - if (sq_int_priv /*&& (kfd_set_dbg_ev_from_interrupt(dev, pasid, + /*if (sq_int_priv && (kfd_set_dbg_ev_from_interrupt(dev, pasid, KFD_CTXID0_DOORBELL_ID(context_id0), KFD_CTXID0_TRAP_CODE(context_id0), - NULL, 0))*/) - return; + NULL, 0))) + return;*/ break; case SQ_INTERRUPT_WORD_ENCODING_ERROR: print_sq_intr_info_error(context_id0, context_id1); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index b059a77b6081..22b077ac9a19 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - lock_page(page); + zone_device_page_init(page); } static void @@ -322,12 +322,13 @@ svm_migrate_copy_to_vram(struct amdgpu_device *adev, struct svm_range *prange, for (i = j = 0; i < npages; i++) { struct page *spage; + dst[i] = cursor.start + (j << PAGE_SHIFT); + migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); + svm_migrate_get_vram_page(prange, migrate->dst[i]); + migrate->dst[i] = migrate_pfn(migrate->dst[i]); + spage = migrate_pfn_to_page(migrate->src[i]); if (spage && !is_zone_device_page(spage)) { - dst[i] = cursor.start + (j << PAGE_SHIFT); - migrate->dst[i] = svm_migrate_addr_to_pfn(adev, dst[i]); - svm_migrate_get_vram_page(prange, migrate->dst[i]); - migrate->dst[i] = migrate_pfn(migrate->dst[i]); src[i] = dma_map_page(dev, spage, 0, PAGE_SIZE, DMA_TO_DEVICE); r = dma_mapping_error(dev, src[i]); @@ -409,7 +410,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, uint64_t npages = (end - start) >> PAGE_SHIFT; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; unsigned long cpages = 0; dma_addr_t *scratch; void *buf; @@ -522,9 +523,6 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t best_loc, pr_debug("svms 0x%p [0x%lx 0x%lx] to gpu 0x%x\n", prange->svms, prange->start, prange->last, best_loc); - /* FIXME: workaround for page locking bug with invalid pages */ - svm_range_prefault(prange, mm, SVM_ADEV_PGMAP_OWNER(adev)); - start = prange->start << PAGE_SHIFT; end = (prange->last + 1) << PAGE_SHIFT; @@ -668,7 +666,7 @@ out_oom: static long svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, uint64_t end, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); uint64_t npages = (end - start) >> PAGE_SHIFT; @@ -676,7 +674,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, unsigned long cpages = 0; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; dma_addr_t *scratch; void *buf; int r = -ENOMEM; @@ -699,6 +697,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.src = buf; migrate.dst = migrate.src + npages; + migrate.fault_page = fault_page; scratch = (dma_addr_t *)(migrate.dst + npages); kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid, @@ -766,7 +765,7 @@ out: * 0 - OK, otherwise error code */ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct amdgpu_device *adev; struct vm_area_struct *vma; @@ -807,7 +806,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, } next = min(vma->vm_end, end); - r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger); + r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger, + fault_page); if (r < 0) { pr_debug("failed %ld to migrate prange %p\n", r, prange); break; @@ -851,7 +851,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc, pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc); do { - r = svm_migrate_vram_to_ram(prange, mm, trigger); + r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL); if (r) return r; } while (prange->actual_loc && --retries); @@ -886,7 +886,7 @@ svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) { unsigned long addr = vmf->address; - struct vm_area_struct *vma; + struct svm_range_bo *svm_bo; enum svm_work_list_ops op; struct svm_range *parent; struct svm_range *prange; @@ -894,29 +894,42 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) struct mm_struct *mm; int r = 0; - vma = vmf->vma; - mm = vma->vm_mm; + svm_bo = vmf->page->zone_device_data; + if (!svm_bo) { + pr_debug("failed get device page at addr 0x%lx\n", addr); + return VM_FAULT_SIGBUS; + } + if (!mmget_not_zero(svm_bo->eviction_fence->mm)) { + pr_debug("addr 0x%lx of process mm is destroyed\n", addr); + return VM_FAULT_SIGBUS; + } + + mm = svm_bo->eviction_fence->mm; + if (mm != vmf->vma->vm_mm) + pr_debug("addr 0x%lx is COW mapping in child process\n", addr); - p = kfd_lookup_process_by_mm(vma->vm_mm); + p = kfd_lookup_process_by_mm(mm); if (!p) { pr_debug("failed find process at fault address 0x%lx\n", addr); - return VM_FAULT_SIGBUS; + r = VM_FAULT_SIGBUS; + goto out_mmput; } if (READ_ONCE(p->svms.faulting_task) == current) { pr_debug("skipping ram migration\n"); - kfd_unref_process(p); - return 0; + r = 0; + goto out_unref_process; } - addr >>= PAGE_SHIFT; + pr_debug("CPU page fault svms 0x%p address 0x%lx\n", &p->svms, addr); + addr >>= PAGE_SHIFT; mutex_lock(&p->svms.lock); prange = svm_range_from_addr(&p->svms, addr, &parent); if (!prange) { - pr_debug("cannot find svm range at 0x%lx\n", addr); + pr_debug("failed get range svms 0x%p addr 0x%lx\n", &p->svms, addr); r = -EFAULT; - goto out; + goto out_unlock_svms; } mutex_lock(&parent->migrate_mutex); @@ -938,10 +951,12 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) goto out_unlock_prange; } - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU); + r = svm_migrate_vram_to_ram(prange, vmf->vma->vm_mm, + KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, + vmf->page); if (r) - pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, - prange, prange->start, prange->last); + pr_debug("failed %d migrate svms 0x%p range 0x%p [0x%lx 0x%lx]\n", + r, prange->svms, prange, prange->start, prange->last); /* xnack on, update mapping on GPUs with ACCESS_IN_PLACE */ if (p->xnack_enabled && parent == prange) @@ -955,12 +970,13 @@ out_unlock_prange: if (prange != parent) mutex_unlock(&prange->migrate_mutex); mutex_unlock(&parent->migrate_mutex); -out: +out_unlock_svms: mutex_unlock(&p->svms.lock); - kfd_unref_process(p); - +out_unref_process: pr_debug("CPU fault svms 0x%p address 0x%lx done\n", &p->svms, addr); - + kfd_unref_process(p); +out_mmput: + mmput(mm); return r ? VM_FAULT_SIGBUS : 0; } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index b3f0754b32fa..a5d7e6d22264 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR { int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, struct mm_struct *mm, uint32_t trigger); int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger); + uint32_t trigger, struct page *fault_page); unsigned long svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c index b8e14c2cc295..4f6390f3236e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v11.c @@ -126,6 +126,10 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->compute_static_thread_mgmt_se1 = 0xFFFFFFFF; m->compute_static_thread_mgmt_se2 = 0xFFFFFFFF; m->compute_static_thread_mgmt_se3 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se4 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se5 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se6 = 0xFFFFFFFF; + m->compute_static_thread_mgmt_se7 = 0xFFFFFFFF; m->cp_hqd_persistent_state = CP_HQD_PERSISTENT_STATE__PRELOAD_REQ_MASK | 0x55 << CP_HQD_PERSISTENT_STATE__PRELOAD_SIZE__SHIFT; @@ -177,14 +181,6 @@ static int load_mqd(struct mqd_manager *mm, void *mqd, return r; } -static int hiq_load_mqd_kiq(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - return mm->dev->kfd2kgd->hiq_mqd_load(mm->dev->adev, mqd, pipe_id, - queue_id, p->doorbell_off); -} - static void update_mqd(struct mqd_manager *mm, void *mqd, struct queue_properties *q, struct mqd_update_info *minfo) @@ -256,31 +252,6 @@ static uint32_t read_doorbell_id(void *mqd) return m->queue_doorbell_id0; } -static int destroy_mqd(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_destroy - (mm->dev->adev, mqd, type, timeout, - pipe_id, queue_id); -} - -static void free_mqd(struct mqd_manager *mm, void *mqd, - struct kfd_mem_obj *mqd_mem_obj) -{ - kfd_gtt_sa_free(mm->dev, mqd_mem_obj); -} - -static bool is_occupied(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_is_occupied( - mm->dev->adev, queue_address, - pipe_id, queue_id); -} - static int get_wave_state(struct mqd_manager *mm, void *mqd, void __user *ctl_stack, u32 *ctl_stack_used_size, @@ -349,15 +320,6 @@ static void init_mqd_sdma(struct mqd_manager *mm, void **mqd, mm->update_mqd(mm, m, q, NULL); } -static int load_mqd_sdma(struct mqd_manager *mm, void *mqd, - uint32_t pipe_id, uint32_t queue_id, - struct queue_properties *p, struct mm_struct *mms) -{ - return mm->dev->kfd2kgd->hqd_sdma_load(mm->dev->adev, mqd, - (uint32_t __user *)p->write_ptr, - mms); -} - #define SDMA_RLC_DUMMY_DEFAULT 0xf static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, @@ -371,7 +333,8 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, << SDMA0_QUEUE0_RB_CNTL__RB_SIZE__SHIFT | q->vmid << SDMA0_QUEUE0_RB_CNTL__RB_VMID__SHIFT | 1 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_ENABLE__SHIFT | - 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT; + 6 << SDMA0_QUEUE0_RB_CNTL__RPTR_WRITEBACK_TIMER__SHIFT | + 1 << SDMA0_QUEUE0_RB_CNTL__F32_WPTR_POLL_ENABLE__SHIFT; m->sdmax_rlcx_rb_base = lower_32_bits(q->queue_address >> 8); m->sdmax_rlcx_rb_base_hi = upper_32_bits(q->queue_address >> 8); @@ -389,25 +352,6 @@ static void update_mqd_sdma(struct mqd_manager *mm, void *mqd, q->is_active = QUEUE_IS_ACTIVE(*q); } -/* - * * preempt type here is ignored because there is only one way - * * to preempt sdma queue - */ -static int destroy_mqd_sdma(struct mqd_manager *mm, void *mqd, - enum kfd_preempt_type type, - unsigned int timeout, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_destroy(mm->dev->adev, mqd, timeout); -} - -static bool is_occupied_sdma(struct mqd_manager *mm, void *mqd, - uint64_t queue_address, uint32_t pipe_id, - uint32_t queue_id) -{ - return mm->dev->kfd2kgd->hqd_sdma_is_occupied(mm->dev->adev, mqd); -} - #if defined(CONFIG_DEBUG_FS) static int debugfs_show_mqd(struct seq_file *m, void *data) @@ -445,11 +389,11 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, pr_debug("%s@%i\n", __func__, __LINE__); mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd; - mqd->free_mqd = free_mqd; + mqd->free_mqd = kfd_free_mqd_cp; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; mqd->mqd_size = sizeof(struct v11_compute_mqd); mqd->get_wave_state = get_wave_state; #if defined(CONFIG_DEBUG_FS) @@ -462,10 +406,10 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, mqd->allocate_mqd = allocate_hiq_mqd; mqd->init_mqd = init_mqd_hiq; mqd->free_mqd = free_mqd_hiq_sdma; - mqd->load_mqd = hiq_load_mqd_kiq; + mqd->load_mqd = kfd_hiq_load_mqd_kiq; mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; mqd->mqd_size = sizeof(struct v11_compute_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; @@ -476,11 +420,11 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, case KFD_MQD_TYPE_DIQ: mqd->allocate_mqd = allocate_mqd; mqd->init_mqd = init_mqd_hiq; - mqd->free_mqd = free_mqd; + mqd->free_mqd = kfd_free_mqd_cp; mqd->load_mqd = load_mqd; mqd->update_mqd = update_mqd; - mqd->destroy_mqd = destroy_mqd; - mqd->is_occupied = is_occupied; + mqd->destroy_mqd = kfd_destroy_mqd_cp; + mqd->is_occupied = kfd_is_occupied_cp; mqd->mqd_size = sizeof(struct v11_compute_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd; @@ -491,10 +435,10 @@ struct mqd_manager *mqd_manager_init_v11(enum KFD_MQD_TYPE type, mqd->allocate_mqd = allocate_sdma_mqd; mqd->init_mqd = init_mqd_sdma; mqd->free_mqd = free_mqd_hiq_sdma; - mqd->load_mqd = load_mqd_sdma; + mqd->load_mqd = kfd_load_mqd_sdma; mqd->update_mqd = update_mqd_sdma; - mqd->destroy_mqd = destroy_mqd_sdma; - mqd->is_occupied = is_occupied_sdma; + mqd->destroy_mqd = kfd_destroy_mqd_sdma; + mqd->is_occupied = kfd_is_occupied_sdma; mqd->mqd_size = sizeof(struct v11_sdma_mqd); #if defined(CONFIG_DEBUG_FS) mqd->debugfs_show_mqd = debugfs_show_mqd_sdma; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 6c83a519b3a1..951b63677248 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -1499,11 +1499,6 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, if (!pdd) return NULL; - if (kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) { - pr_err("Failed to alloc doorbell for pdd\n"); - goto err_free_pdd; - } - if (init_doorbell_bitmap(&pdd->qpd, dev)) { pr_err("Failed to init doorbell for process\n"); goto err_free_pdd; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index 6e3e7f54381b..5137476ec18e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -857,6 +857,13 @@ int kfd_criu_restore_queue(struct kfd_process *p, ret = -EINVAL; goto exit; } + + if (!pdd->doorbell_index && + kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) { + ret = -ENOMEM; + goto exit; + } + /* data stored in this order: mqd, ctl_stack */ mqd = q_extra_data; ctl_stack = mqd + q_data->mqd_size; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 11074cc8c333..64fdf63093a0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -278,7 +278,7 @@ static void svm_range_free(struct svm_range *prange, bool update_mem_usage) svm_range_free_dma_mappings(prange); if (update_mem_usage && !p->xnack_enabled) { - pr_debug("unreserve mem limit: %lld\n", size); + pr_debug("unreserve prange 0x%p size: 0x%llx\n", prange, size); amdgpu_amdkfd_unreserve_mem_limit(NULL, size, KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); } @@ -2913,13 +2913,15 @@ retry_write_locked: */ if (prange->actual_loc) r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); else r = 0; } } else { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); } if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", @@ -2956,6 +2958,64 @@ out: return r; } +int +svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled) +{ + struct svm_range *prange, *pchild; + uint64_t reserved_size = 0; + uint64_t size; + int r = 0; + + pr_debug("switching xnack from %d to %d\n", p->xnack_enabled, xnack_enabled); + + mutex_lock(&p->svms.lock); + + list_for_each_entry(prange, &p->svms.list, list) { + svm_range_lock(prange); + list_for_each_entry(pchild, &prange->child_list, child_list) { + size = (pchild->last - pchild->start + 1) << PAGE_SHIFT; + if (xnack_enabled) { + amdgpu_amdkfd_unreserve_mem_limit(NULL, size, + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + } else { + r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + if (r) + goto out_unlock; + reserved_size += size; + } + } + + size = (prange->last - prange->start + 1) << PAGE_SHIFT; + if (xnack_enabled) { + amdgpu_amdkfd_unreserve_mem_limit(NULL, size, + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + } else { + r = amdgpu_amdkfd_reserve_mem_limit(NULL, size, + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + if (r) + goto out_unlock; + reserved_size += size; + } +out_unlock: + svm_range_unlock(prange); + if (r) + break; + } + + if (r) + amdgpu_amdkfd_unreserve_mem_limit(NULL, reserved_size, + KFD_IOC_ALLOC_MEM_FLAGS_USERPTR); + else + /* Change xnack mode must be inside svms lock, to avoid race with + * svm_range_deferred_list_work unreserve memory in parallel. + */ + p->xnack_enabled = xnack_enabled; + + mutex_unlock(&p->svms.lock); + return r; +} + void svm_range_list_fini(struct kfd_process *p) { struct svm_range *prange; @@ -3181,28 +3241,6 @@ out: return best_loc; } -/* FIXME: This is a workaround for page locking bug when some pages are - * invalid during migration to VRAM - */ -void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm, - void *owner) -{ - struct hmm_range *hmm_range; - int r; - - if (prange->validated_once) - return; - - r = amdgpu_hmm_range_get_pages(&prange->notifier, mm, NULL, - prange->start << PAGE_SHIFT, - prange->npages, &hmm_range, - false, true, owner); - if (!r) { - amdgpu_hmm_range_get_pages_done(hmm_range); - prange->validated_once = true; - } -} - /* svm_range_trigger_migration - start page migration if prefetch loc changed * @mm: current process mm_struct * @prange: svm range structure @@ -3242,7 +3280,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, return 0; if (!best_loc) { - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); + r = svm_migrate_vram_to_ram(prange, mm, + KFD_MIGRATE_TRIGGER_PREFETCH, NULL); *migrated = !r; return r; } @@ -3303,7 +3342,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mutex_lock(&prange->migrate_mutex); do { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_TTM_EVICTION); + KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); } while (!r && prange->actual_loc && --retries); if (!r && prange->actual_loc) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h index cfac13ad06ef..7a33b93f9df6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h @@ -181,8 +181,6 @@ void schedule_deferred_list_work(struct svm_range_list *svms); void svm_range_dma_unmap(struct device *dev, dma_addr_t *dma_addr, unsigned long offset, unsigned long npages); void svm_range_free_dma_mappings(struct svm_range *prange); -void svm_range_prefault(struct svm_range *prange, struct mm_struct *mm, - void *owner); int svm_range_get_info(struct kfd_process *p, uint32_t *num_svm_ranges, uint64_t *svm_priv_data_size); int kfd_criu_checkpoint_svm(struct kfd_process *p, @@ -205,6 +203,7 @@ void svm_range_list_lock_and_flush_work(struct svm_range_list *svms, struct mm_s void svm_range_bo_unref_async(struct svm_range_bo *svm_bo); void svm_range_set_max_pages(struct amdgpu_device *adev); +int svm_range_switch_xnack_reserve_mem(struct kfd_process *p, bool xnack_enabled); #else |