Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_chardev.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1724 |
1 file changed, 1267 insertions, 457 deletions
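A recurring change in the diff below: per-GPU ioctls stop using the global kfd_device_by_id() lookup and instead resolve args->gpu_id to the calling process's own kfd_process_device under p->mutex via kfd_process_device_data_by_id(), with the new kfd_lock_pdd_by_id()/kfd_unlock_pdd() helpers wrapping that pattern. The following is a minimal sketch of such a handler, modeled on kfd_ioctl_get_available_memory() in this patch; the handler name and its args struct are hypothetical, and the snippet assumes the helper declarations added by this patch.

/* Sketch only: hypothetical per-GPU ioctl handler using the new helpers. */
static int kfd_ioctl_example_per_gpu(struct file *filep,
				     struct kfd_process *p, void *data)
{
	struct kfd_ioctl_example_args *args = data;	/* hypothetical uapi struct */
	struct kfd_process_device *pdd;

	/* Takes p->mutex and returns the pdd for this gpu_id, or NULL
	 * (with the mutex already released) if the process never opened
	 * that GPU.
	 */
	pdd = kfd_lock_pdd_by_id(p, args->gpu_id);
	if (!pdd)
		return -EINVAL;

	/* ... operate on pdd->dev / pdd->dev->adev while holding p->mutex ... */

	kfd_unlock_pdd(pdd);	/* drops p->mutex */
	return 0;
}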
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 4bfc0c8ab764..6d291aa6386b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Copyright 2014 Advanced Micro Devices, Inc. + * Copyright 2014-2022 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -33,14 +34,16 @@ #include <linux/time.h> #include <linux/mm.h> #include <linux/mman.h> +#include <linux/ptrace.h> #include <linux/dma-buf.h> -#include <asm/processor.h> +#include <linux/fdtable.h> +#include <linux/processor.h> #include "kfd_priv.h" #include "kfd_device_queue_manager.h" -#include "kfd_dbgmgr.h" #include "kfd_svm.h" #include "amdgpu_amdkfd.h" #include "kfd_smi_events.h" +#include "amdgpu_dma_buf.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -62,6 +65,25 @@ static int kfd_char_dev_major = -1; static struct class *kfd_class; struct device *kfd_device; +static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id) +{ + struct kfd_process_device *pdd; + + mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, gpu_id); + + if (pdd) + return pdd; + + mutex_unlock(&p->mutex); + return NULL; +} + +static inline void kfd_unlock_pdd(struct kfd_process_device *pdd) +{ + mutex_unlock(&pdd->process->mutex); +} + int kfd_chardev_init(void) { int err = 0; @@ -101,11 +123,6 @@ void kfd_chardev_exit(void) kfd_device = NULL; } -struct device *kfd_chardev(void) -{ - return kfd_device; -} - static int kfd_open(struct inode *inode, struct file *filep) { @@ -282,6 +299,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, struct kfd_process_device *pdd; struct queue_properties q_properties; uint32_t doorbell_offset_in_process = 0; + struct amdgpu_bo *wptr_bo = NULL; memset(&q_properties, 0, sizeof(struct queue_properties)); @@ -292,26 +310,72 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, return err; pr_debug("Looking for gpu id 0x%x\n", args->gpu_id); - dev = kfd_device_by_id(args->gpu_id); - if (!dev) { - pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); - return -EINVAL; - } mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + if (!pdd) { + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + err = -EINVAL; + goto err_pdd; + } + dev = pdd->dev; + pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { err = -ESRCH; goto err_bind_process; } + if (!pdd->doorbell_index && + kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) { + err = -ENOMEM; + goto err_alloc_doorbells; + } + + /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work + * on unmapped queues for usermode queue oversubscription (no aggregated doorbell) + */ + if (dev->shared_resources.enable_mes && + ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK) + >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) { + struct amdgpu_bo_va_mapping *wptr_mapping; + struct amdgpu_vm *wptr_vm; + + wptr_vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(wptr_vm->root.bo, false); + if (err) + goto err_wptr_map_gart; + + wptr_mapping = amdgpu_vm_bo_lookup_mapping( + wptr_vm, args->write_pointer_address >> PAGE_SHIFT); + 
amdgpu_bo_unreserve(wptr_vm->root.bo); + if (!wptr_mapping) { + pr_err("Failed to lookup wptr bo\n"); + err = -EINVAL; + goto err_wptr_map_gart; + } + + wptr_bo = wptr_mapping->bo_va->base.bo; + if (wptr_bo->tbo.base.size > PAGE_SIZE) { + pr_err("Requested GART mapping for wptr bo larger than one page\n"); + err = -EINVAL; + goto err_wptr_map_gart; + } + + err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo); + if (err) { + pr_err("Failed to map wptr bo to GART\n"); + goto err_wptr_map_gart; + } + } + pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n", p->pasid, dev->id); - err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, - &doorbell_offset_in_process); + err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo, + NULL, NULL, NULL, &doorbell_offset_in_process); if (err != 0) goto err_create_queue; @@ -343,7 +407,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, return 0; err_create_queue: + if (wptr_bo) + amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo); +err_wptr_map_gart: +err_alloc_doorbells: err_bind_process: +err_pdd: mutex_unlock(&p->mutex); return err; } @@ -490,7 +559,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_set_memory_policy_args *args = data; - struct kfd_dev *dev; int err = 0; struct kfd_process_device *pdd; enum cache_policy default_policy, alternate_policy; @@ -505,13 +573,15 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, return -EINVAL; } - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + if (!pdd) { + pr_debug("Could not find gpu id 0x%x\n", args->gpu_id); + err = -EINVAL; + goto err_pdd; + } - pdd = kfd_bind_process_to_device(dev, p); + pdd = kfd_bind_process_to_device(pdd->dev, p); if (IS_ERR(pdd)) { err = -ESRCH; goto out; @@ -524,7 +594,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, (args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT) ? 
cache_policy_coherent : cache_policy_noncoherent; - if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm, + if (!pdd->dev->dqm->ops.set_cache_memory_policy(pdd->dev->dqm, &pdd->qpd, default_policy, alternate_policy, @@ -533,6 +603,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep, err = -EINVAL; out: +err_pdd: mutex_unlock(&p->mutex); return err; @@ -542,17 +613,18 @@ static int kfd_ioctl_set_trap_handler(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_set_trap_handler_args *args = data; - struct kfd_dev *dev; int err = 0; struct kfd_process_device *pdd; - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(dev, p); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + if (!pdd) { + err = -EINVAL; + goto err_pdd; + } + + pdd = kfd_bind_process_to_device(pdd->dev, p); if (IS_ERR(pdd)) { err = -ESRCH; goto out; @@ -561,6 +633,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep, kfd_process_set_trap_handler(&pdd->qpd, args->tba_addr, args->tma_addr); out: +err_pdd: mutex_unlock(&p->mutex); return err; @@ -569,289 +642,40 @@ out: static int kfd_ioctl_dbg_register(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_register_args *args = data; - struct kfd_dev *dev; - struct kfd_dbgmgr *dbgmgr_ptr; - struct kfd_process_device *pdd; - bool create_ok; - long status = 0; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_register not supported on CZ\n"); - return -EINVAL; - } - - mutex_lock(&p->mutex); - mutex_lock(kfd_get_dbgmgr_mutex()); - - /* - * make sure that we have pdd, if this the first queue created for - * this process - */ - pdd = kfd_bind_process_to_device(dev, p); - if (IS_ERR(pdd)) { - status = PTR_ERR(pdd); - goto out; - } - - if (!dev->dbgmgr) { - /* In case of a legal call, we have no dbgmgr yet */ - create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev); - if (create_ok) { - status = kfd_dbgmgr_register(dbgmgr_ptr, p); - if (status != 0) - kfd_dbgmgr_destroy(dbgmgr_ptr); - else - dev->dbgmgr = dbgmgr_ptr; - } - } else { - pr_debug("debugger already registered\n"); - status = -EINVAL; - } - -out: - mutex_unlock(kfd_get_dbgmgr_mutex()); - mutex_unlock(&p->mutex); - - return status; + return -EPERM; } static int kfd_ioctl_dbg_unregister(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_unregister_args *args = data; - struct kfd_dev *dev; - long status; - - dev = kfd_device_by_id(args->gpu_id); - if (!dev || !dev->dbgmgr) - return -EINVAL; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n"); - return -EINVAL; - } - - mutex_lock(kfd_get_dbgmgr_mutex()); - - status = kfd_dbgmgr_unregister(dev->dbgmgr, p); - if (!status) { - kfd_dbgmgr_destroy(dev->dbgmgr); - dev->dbgmgr = NULL; - } - - mutex_unlock(kfd_get_dbgmgr_mutex()); - - return status; + return -EPERM; } -/* - * Parse and generate variable size data structure for address watch. - * Total size of the buffer and # watch points is limited in order - * to prevent kernel abuse. (no bearing to the much smaller HW limitation - * which is enforced by dbgdev module) - * please also note that the watch address itself are not "copied from user", - * since it be set into the HW in user mode values. 
- * - */ static int kfd_ioctl_dbg_address_watch(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_address_watch_args *args = data; - struct kfd_dev *dev; - struct dbg_address_watch_info aw_info; - unsigned char *args_buff; - long status; - void __user *cmd_from_user; - uint64_t watch_mask_value = 0; - unsigned int args_idx = 0; - - memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info)); - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; - } - - cmd_from_user = (void __user *) args->content_ptr; - - /* Validate arguments */ - - if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) || - (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) || - (cmd_from_user == NULL)) - return -EINVAL; - - /* this is the actual buffer to work with */ - args_buff = memdup_user(cmd_from_user, - args->buf_size_in_bytes - sizeof(*args)); - if (IS_ERR(args_buff)) - return PTR_ERR(args_buff); - - aw_info.process = p; - - aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx])); - args_idx += sizeof(aw_info.num_watch_points); - - aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx]; - args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points; - - /* - * set watch address base pointer to point on the array base - * within args_buff - */ - aw_info.watch_address = (uint64_t *) &args_buff[args_idx]; - - /* skip over the addresses buffer */ - args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points; - - if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) { - status = -EINVAL; - goto out; - } - - watch_mask_value = (uint64_t) args_buff[args_idx]; - - if (watch_mask_value > 0) { - /* - * There is an array of masks. 
- * set watch mask base pointer to point on the array base - * within args_buff - */ - aw_info.watch_mask = (uint64_t *) &args_buff[args_idx]; - - /* skip over the masks buffer */ - args_idx += sizeof(aw_info.watch_mask) * - aw_info.num_watch_points; - } else { - /* just the NULL mask, set to NULL and skip over it */ - aw_info.watch_mask = NULL; - args_idx += sizeof(aw_info.watch_mask); - } - - if (args_idx >= args->buf_size_in_bytes - sizeof(args)) { - status = -EINVAL; - goto out; - } - - /* Currently HSA Event is not supported for DBG */ - aw_info.watch_event = NULL; - - mutex_lock(kfd_get_dbgmgr_mutex()); - - status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info); - - mutex_unlock(kfd_get_dbgmgr_mutex()); - -out: - kfree(args_buff); - - return status; + return -EPERM; } /* Parse and generate fixed size data structure for wave control */ static int kfd_ioctl_dbg_wave_control(struct file *filep, struct kfd_process *p, void *data) { - struct kfd_ioctl_dbg_wave_control_args *args = data; - struct kfd_dev *dev; - struct dbg_wave_control_info wac_info; - unsigned char *args_buff; - uint32_t computed_buff_size; - long status; - void __user *cmd_from_user; - unsigned int args_idx = 0; - - memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info)); - - /* we use compact form, independent of the packing attribute value */ - computed_buff_size = sizeof(*args) + - sizeof(wac_info.mode) + - sizeof(wac_info.operand) + - sizeof(wac_info.dbgWave_msg.DbgWaveMsg) + - sizeof(wac_info.dbgWave_msg.MemoryVA) + - sizeof(wac_info.trapId); - - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - - if (dev->adev->asic_type == CHIP_CARRIZO) { - pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n"); - return -EINVAL; - } - - /* input size must match the computed "compact" size */ - if (args->buf_size_in_bytes != computed_buff_size) { - pr_debug("size mismatch, computed : actual %u : %u\n", - args->buf_size_in_bytes, computed_buff_size); - return -EINVAL; - } - - cmd_from_user = (void __user *) args->content_ptr; - - if (cmd_from_user == NULL) - return -EINVAL; - - /* copy the entire buffer from user */ - - args_buff = memdup_user(cmd_from_user, - args->buf_size_in_bytes - sizeof(*args)); - if (IS_ERR(args_buff)) - return PTR_ERR(args_buff); - - /* move ptr to the start of the "pay-load" area */ - wac_info.process = p; - - wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.operand); - - wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.mode); - - wac_info.trapId = *((uint32_t *)(&args_buff[args_idx])); - args_idx += sizeof(wac_info.trapId); - - wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value = - *((uint32_t *)(&args_buff[args_idx])); - wac_info.dbgWave_msg.MemoryVA = NULL; - - mutex_lock(kfd_get_dbgmgr_mutex()); - - pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n", - wac_info.process, wac_info.operand, - wac_info.mode, wac_info.trapId, - wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value); - - status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info); - - pr_debug("Returned status of dbg manager is %ld\n", status); - - mutex_unlock(kfd_get_dbgmgr_mutex()); - - kfree(args_buff); - - return status; + return -EPERM; } static int kfd_ioctl_get_clock_counters(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_get_clock_counters_args *args = data; - struct kfd_dev *dev; + struct kfd_process_device *pdd; - dev = 
kfd_device_by_id(args->gpu_id); - if (dev) + mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + mutex_unlock(&p->mutex); + if (pdd) /* Reading GPU clock counter from KGD */ - args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(dev->adev); + args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(pdd->dev->adev); else /* Node without GPU resource */ args->gpu_clock_counter = 0; @@ -1007,57 +831,11 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, * through the event_page_offset field. */ if (args->event_page_offset) { - struct kfd_dev *kfd; - struct kfd_process_device *pdd; - void *mem, *kern_addr; - uint64_t size; - - kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset)); - if (!kfd) { - pr_err("Getting device by id failed in %s\n", __func__); - return -EINVAL; - } - mutex_lock(&p->mutex); - - if (p->signal_page) { - pr_err("Event page is already set\n"); - err = -EINVAL; - goto out_unlock; - } - - pdd = kfd_bind_process_to_device(kfd, p); - if (IS_ERR(pdd)) { - err = PTR_ERR(pdd); - goto out_unlock; - } - - mem = kfd_process_device_translate_handle(pdd, - GET_IDR_HANDLE(args->event_page_offset)); - if (!mem) { - pr_err("Can't find BO, offset is 0x%llx\n", - args->event_page_offset); - err = -EINVAL; - goto out_unlock; - } - - err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->adev, - mem, &kern_addr, &size); - if (err) { - pr_err("Failed to map event page to kernel\n"); - goto out_unlock; - } - - err = kfd_event_page_set(p, kern_addr, size); - if (err) { - pr_err("Failed to set event page\n"); - amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(kfd->adev, mem); - goto out_unlock; - } - - p->signal_handle = args->event_page_offset; - + err = kfd_kmap_event_page(p, args->event_page_offset); mutex_unlock(&p->mutex); + if (err) + return err; } err = kfd_event_create(filp, p, args->event_type, @@ -1066,10 +844,7 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p, &args->event_page_offset, &args->event_slot_index); - return err; - -out_unlock: - mutex_unlock(&p->mutex); + pr_debug("Created event (id:0x%08x) (%s)\n", args->event_id, __func__); return err; } @@ -1101,14 +876,11 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p, void *data) { struct kfd_ioctl_wait_events_args *args = data; - int err; - err = kfd_wait_on_events(p, args->num_events, + return kfd_wait_on_events(p, args->num_events, (void __user *)args->events_ptr, (args->wait_for_all != 0), - args->timeout, &args->wait_result); - - return err; + &args->timeout, &args->wait_result); } static int kfd_ioctl_set_scratch_backing_va(struct file *filep, struct kfd_process *p, void *data) @@ -1118,11 +890,13 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep, struct kfd_dev *dev; long err; - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + if (!pdd) { + err = -EINVAL; + goto err_pdd; + } + dev = pdd->dev; pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { @@ -1142,6 +916,7 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep, return 0; bind_process_to_device_fail: +err_pdd: mutex_unlock(&p->mutex); return err; } @@ -1150,15 +925,17 @@ static int kfd_ioctl_get_tile_config(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_get_tile_config_args *args = data; - struct kfd_dev *dev; + struct kfd_process_device *pdd; struct tile_config config; int err = 0; - 
dev = kfd_device_by_id(args->gpu_id); - if (!dev) + mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + mutex_unlock(&p->mutex); + if (!pdd) return -EINVAL; - amdgpu_amdkfd_get_tile_config(dev->adev, &config); + amdgpu_amdkfd_get_tile_config(pdd->dev->adev, &config); args->gb_addr_config = config.gb_addr_config; args->num_banks = config.num_banks; @@ -1193,40 +970,37 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, { struct kfd_ioctl_acquire_vm_args *args = data; struct kfd_process_device *pdd; - struct kfd_dev *dev; struct file *drm_file; int ret; - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - drm_file = fget(args->drm_fd); if (!drm_file) return -EINVAL; mutex_lock(&p->mutex); - - pdd = kfd_get_process_device_data(dev, p); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); if (!pdd) { ret = -EINVAL; - goto err_unlock; + goto err_pdd; } if (pdd->drm_file) { ret = pdd->drm_file == drm_file ? 0 : -EBUSY; - goto err_unlock; + goto err_drm_file; } ret = kfd_process_device_init_vm(pdd, drm_file); if (ret) goto err_unlock; + /* On success, the PDD keeps the drm_file reference */ mutex_unlock(&p->mutex); return 0; err_unlock: +err_pdd: +err_drm_file: mutex_unlock(&p->mutex); fput(drm_file); return ret; @@ -1234,8 +1008,6 @@ err_unlock: bool kfd_dev_is_large_bar(struct kfd_dev *dev) { - struct kfd_local_mem_info mem_info; - if (debug_largebar) { pr_debug("Simulate large-bar allocation on non large-bar machine\n"); return true; @@ -1244,13 +1016,25 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev) if (dev->use_iommu_v2) return false; - amdgpu_amdkfd_get_local_mem_info(dev->adev, &mem_info); - if (mem_info.local_mem_size_private == 0 && - mem_info.local_mem_size_public > 0) + if (dev->local_mem_info.local_mem_size_private == 0 && + dev->local_mem_info.local_mem_size_public > 0) return true; return false; } +static int kfd_ioctl_get_available_memory(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_get_available_memory_args *args = data; + struct kfd_process_device *pdd = kfd_lock_pdd_by_id(p, args->gpu_id); + + if (!pdd) + return -EINVAL; + args->available = amdgpu_amdkfd_get_available_memory(pdd->dev->adev); + kfd_unlock_pdd(pdd); + return 0; +} + static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, struct kfd_process *p, void *data) { @@ -1283,19 +1067,23 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, } mutex_unlock(&p->svms.lock); #endif - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; + mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + if (!pdd) { + err = -EINVAL; + goto err_pdd; + } + + dev = pdd->dev; if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) && (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) && !kfd_dev_is_large_bar(dev)) { pr_err("Alloc host visible vram on small bar is not allowed\n"); - return -EINVAL; + err = -EINVAL; + goto err_large_bar; } - mutex_lock(&p->mutex); - pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { err = PTR_ERR(pdd); @@ -1308,6 +1096,10 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, goto err_unlock; } offset = kfd_get_process_doorbells(pdd); + if (!offset) { + err = -ENOMEM; + goto err_unlock; + } } else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { if (args->size != PAGE_SIZE) { err = -EINVAL; @@ -1323,7 +1115,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep, err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu( dev->adev, args->va_addr, 
args->size, pdd->drm_priv, (struct kgd_mem **) &mem, &offset, - flags); + flags, false); if (err) goto err_unlock; @@ -1356,6 +1148,8 @@ err_free: amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, (struct kgd_mem *)mem, pdd->drm_priv, NULL); err_unlock: +err_pdd: +err_large_bar: mutex_unlock(&p->mutex); return err; } @@ -1366,14 +1160,9 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, struct kfd_ioctl_free_memory_of_gpu_args *args = data; struct kfd_process_device *pdd; void *mem; - struct kfd_dev *dev; int ret; uint64_t size = 0; - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - mutex_lock(&p->mutex); /* * Safeguard to prevent user space from freeing signal BO. @@ -1385,11 +1174,11 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, goto err_unlock; } - pdd = kfd_get_process_device_data(dev, p); + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle)); if (!pdd) { pr_err("Process device data doesn't exist\n"); ret = -EINVAL; - goto err_unlock; + goto err_pdd; } mem = kfd_process_device_translate_handle( @@ -1399,7 +1188,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, goto err_unlock; } - ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, + ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, (struct kgd_mem *)mem, pdd->drm_priv, &size); /* If freeing the buffer failed, leave the handle in place for @@ -1412,6 +1201,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep, WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size); err_unlock: +err_pdd: mutex_unlock(&p->mutex); return ret; } @@ -1422,15 +1212,10 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, struct kfd_ioctl_map_memory_to_gpu_args *args = data; struct kfd_process_device *pdd, *peer_pdd; void *mem; - struct kfd_dev *dev, *peer; + struct kfd_dev *dev; long err = 0; int i; uint32_t *devices_arr = NULL; - bool table_freed = false; - - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; if (!args->n_devices) { pr_debug("Device IDs array empty\n"); @@ -1455,6 +1240,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, } mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle)); + if (!pdd) { + err = -EINVAL; + goto get_process_device_data_failed; + } + dev = pdd->dev; pdd = kfd_bind_process_to_device(dev, p); if (IS_ERR(pdd)) { @@ -1470,25 +1261,33 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, } for (i = args->n_success; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (!peer) { + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); + if (!peer_pdd) { pr_debug("Getting device by id failed for 0x%x\n", devices_arr[i]); err = -EINVAL; goto get_mem_obj_from_handle_failed; } - peer_pdd = kfd_bind_process_to_device(peer, p); + peer_pdd = kfd_bind_process_to_device(peer_pdd->dev, p); if (IS_ERR(peer_pdd)) { err = PTR_ERR(peer_pdd); goto get_mem_obj_from_handle_failed; } + err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu( - peer->adev, (struct kgd_mem *)mem, - peer_pdd->drm_priv, &table_freed); + peer_pdd->dev->adev, (struct kgd_mem *)mem, + peer_pdd->drm_priv); if (err) { - pr_err("Failed to map to gpu %d/%d\n", - i, args->n_devices); + struct pci_dev *pdev = peer_pdd->dev->adev->pdev; + + dev_err(dev->adev->dev, + "Failed to map peer:%04x:%02x:%02x.%d mem_domain:%d\n", + pci_domain_nr(pdev->bus), + pdev->bus->number, + PCI_SLOT(pdev->devfn), + PCI_FUNC(pdev->devfn), + ((struct kgd_mem *)mem)->domain); goto 
map_memory_to_gpu_failed; } args->n_success = i+1; @@ -1503,21 +1302,17 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep, } /* Flush TLBs after waiting for the page table updates to complete */ - if (table_freed) { - for (i = 0; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - peer_pdd = kfd_get_process_device_data(peer, p); - if (WARN_ON_ONCE(!peer_pdd)) - continue; - kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY); - } + for (i = 0; i < args->n_devices; i++) { + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); + if (WARN_ON_ONCE(!peer_pdd)) + continue; + kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY); } kfree(devices_arr); return err; +get_process_device_data_failed: bind_process_to_device_failed: get_mem_obj_from_handle_failed: map_memory_to_gpu_failed: @@ -1535,14 +1330,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, struct kfd_ioctl_unmap_memory_from_gpu_args *args = data; struct kfd_process_device *pdd, *peer_pdd; void *mem; - struct kfd_dev *dev, *peer; long err = 0; uint32_t *devices_arr = NULL, i; - dev = kfd_device_by_id(GET_GPU_ID(args->handle)); - if (!dev) - return -EINVAL; - if (!args->n_devices) { pr_debug("Device IDs array empty\n"); return -EINVAL; @@ -1566,8 +1356,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } mutex_lock(&p->mutex); - - pdd = kfd_get_process_device_data(dev, p); + pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle)); if (!pdd) { err = -EINVAL; goto bind_process_to_device_failed; @@ -1581,19 +1370,13 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } for (i = args->n_success; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (!peer) { - err = -EINVAL; - goto get_mem_obj_from_handle_failed; - } - - peer_pdd = kfd_get_process_device_data(peer, p); + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); if (!peer_pdd) { - err = -ENODEV; + err = -EINVAL; goto get_mem_obj_from_handle_failed; } err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu( - peer->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); + peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv); if (err) { pr_err("Failed to unmap from gpu %d/%d\n", i, args->n_devices); @@ -1603,8 +1386,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, } mutex_unlock(&p->mutex); - if (KFD_GC_VERSION(dev) == IP_VERSION(9, 4, 2)) { - err = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev, + if (kfd_flush_tlb_after_unmap(pdd->dev)) { + err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev, (struct kgd_mem *) mem, true); if (err) { pr_debug("Sync memory failed, wait interrupted by user signal\n"); @@ -1613,10 +1396,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep, /* Flush TLBs after waiting for the page table updates to complete */ for (i = 0; i < args->n_devices; i++) { - peer = kfd_device_by_id(devices_arr[i]); - if (WARN_ON_ONCE(!peer)) - continue; - peer_pdd = kfd_get_process_device_data(peer, p); + peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]); if (WARN_ON_ONCE(!peer_pdd)) continue; kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT); @@ -1736,29 +1516,29 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, struct kfd_ioctl_import_dmabuf_args *args = data; struct kfd_process_device *pdd; struct dma_buf *dmabuf; - struct kfd_dev *dev; int idr_handle; uint64_t size; void *mem; int r; - dev = kfd_device_by_id(args->gpu_id); - if (!dev) - return -EINVAL; - dmabuf = 
dma_buf_get(args->dmabuf_fd); if (IS_ERR(dmabuf)) return PTR_ERR(dmabuf); mutex_lock(&p->mutex); + pdd = kfd_process_device_data_by_id(p, args->gpu_id); + if (!pdd) { + r = -EINVAL; + goto err_unlock; + } - pdd = kfd_bind_process_to_device(dev, p); + pdd = kfd_bind_process_to_device(pdd->dev, p); if (IS_ERR(pdd)) { r = PTR_ERR(pdd); goto err_unlock; } - r = amdgpu_amdkfd_gpuvm_import_dmabuf(dev->adev, dmabuf, + r = amdgpu_amdkfd_gpuvm_import_dmabuf(pdd->dev->adev, dmabuf, args->va_addr, pdd->drm_priv, (struct kgd_mem **)&mem, &size, NULL); @@ -1779,7 +1559,7 @@ static int kfd_ioctl_import_dmabuf(struct file *filep, return 0; err_free: - amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, (struct kgd_mem *)mem, + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, (struct kgd_mem *)mem, pdd->drm_priv, NULL); err_unlock: mutex_unlock(&p->mutex); @@ -1792,15 +1572,20 @@ static int kfd_ioctl_smi_events(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_smi_events_args *args = data; - struct kfd_dev *dev; + struct kfd_process_device *pdd; - dev = kfd_device_by_id(args->gpuid); - if (!dev) + mutex_lock(&p->mutex); + + pdd = kfd_process_device_data_by_id(p, args->gpuid); + mutex_unlock(&p->mutex); + if (!pdd) return -EINVAL; - return kfd_smi_event_open(dev, &args->anon_fd); + return kfd_smi_event_open(pdd->dev, &args->anon_fd); } +#if IS_ENABLED(CONFIG_HSA_AMD_SVM) + static int kfd_ioctl_set_xnack_mode(struct file *filep, struct kfd_process *p, void *data) { @@ -1811,22 +1596,29 @@ static int kfd_ioctl_set_xnack_mode(struct file *filep, if (args->xnack_enabled >= 0) { if (!list_empty(&p->pqm.queues)) { pr_debug("Process has user queues running\n"); - mutex_unlock(&p->mutex); - return -EBUSY; + r = -EBUSY; + goto out_unlock; } - if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) + + if (p->xnack_enabled == args->xnack_enabled) + goto out_unlock; + + if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) { r = -EPERM; - else - p->xnack_enabled = args->xnack_enabled; + goto out_unlock; + } + + r = svm_range_switch_xnack_reserve_mem(p, args->xnack_enabled); } else { args->xnack_enabled = p->xnack_enabled; } + +out_unlock: mutex_unlock(&p->mutex); return r; } -#if IS_ENABLED(CONFIG_HSA_AMD_SVM) static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) { struct kfd_ioctl_svm_args *args = data; @@ -1840,22 +1632,1014 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) if (!args->start_addr || !args->size) return -EINVAL; - mutex_lock(&p->mutex); - r = svm_ioctl(p, args->op, args->start_addr, args->size, args->nattr, args->attrs); - mutex_unlock(&p->mutex); - return r; } #else +static int kfd_ioctl_set_xnack_mode(struct file *filep, + struct kfd_process *p, void *data) +{ + return -EPERM; +} static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data) { return -EPERM; } #endif +static int criu_checkpoint_process(struct kfd_process *p, + uint8_t __user *user_priv_data, + uint64_t *priv_offset) +{ + struct kfd_criu_process_priv_data process_priv; + int ret; + + memset(&process_priv, 0, sizeof(process_priv)); + + process_priv.version = KFD_CRIU_PRIV_VERSION; + /* For CR, we don't consider negative xnack mode which is used for + * querying without changing it, here 0 simply means disabled and 1 + * means enabled so retry for finding a valid PTE. + */ + process_priv.xnack_mode = p->xnack_enabled ? 
1 : 0; + + ret = copy_to_user(user_priv_data + *priv_offset, + &process_priv, sizeof(process_priv)); + + if (ret) { + pr_err("Failed to copy process information to user\n"); + ret = -EFAULT; + } + + *priv_offset += sizeof(process_priv); + return ret; +} + +static int criu_checkpoint_devices(struct kfd_process *p, + uint32_t num_devices, + uint8_t __user *user_addr, + uint8_t __user *user_priv_data, + uint64_t *priv_offset) +{ + struct kfd_criu_device_priv_data *device_priv = NULL; + struct kfd_criu_device_bucket *device_buckets = NULL; + int ret = 0, i; + + device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL); + if (!device_buckets) { + ret = -ENOMEM; + goto exit; + } + + device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL); + if (!device_priv) { + ret = -ENOMEM; + goto exit; + } + + for (i = 0; i < num_devices; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + + device_buckets[i].user_gpu_id = pdd->user_gpu_id; + device_buckets[i].actual_gpu_id = pdd->dev->id; + + /* + * priv_data does not contain useful information for now and is reserved for + * future use, so we do not set its contents. + */ + } + + ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets)); + if (ret) { + pr_err("Failed to copy device information to user\n"); + ret = -EFAULT; + goto exit; + } + + ret = copy_to_user(user_priv_data + *priv_offset, + device_priv, + num_devices * sizeof(*device_priv)); + if (ret) { + pr_err("Failed to copy device information to user\n"); + ret = -EFAULT; + } + *priv_offset += num_devices * sizeof(*device_priv); + +exit: + kvfree(device_buckets); + kvfree(device_priv); + return ret; +} + +static uint32_t get_process_num_bos(struct kfd_process *p) +{ + uint32_t num_of_bos = 0; + int i; + + /* Run over all PDDs of the process */ + for (i = 0; i < p->n_pdds; i++) { + struct kfd_process_device *pdd = p->pdds[i]; + void *mem; + int id; + + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + struct kgd_mem *kgd_mem = (struct kgd_mem *)mem; + + if ((uint64_t)kgd_mem->va > pdd->gpuvm_base) + num_of_bos++; + } + } + return num_of_bos; +} + +static int criu_get_prime_handle(struct drm_gem_object *gobj, int flags, + u32 *shared_fd) +{ + struct dma_buf *dmabuf; + int ret; + + dmabuf = amdgpu_gem_prime_export(gobj, flags); + if (IS_ERR(dmabuf)) { + ret = PTR_ERR(dmabuf); + pr_err("dmabuf export failed for the BO\n"); + return ret; + } + + ret = dma_buf_fd(dmabuf, flags); + if (ret < 0) { + pr_err("dmabuf create fd failed, ret:%d\n", ret); + goto out_free_dmabuf; + } + + *shared_fd = ret; + return 0; + +out_free_dmabuf: + dma_buf_put(dmabuf); + return ret; +} + +static int criu_checkpoint_bos(struct kfd_process *p, + uint32_t num_bos, + uint8_t __user *user_bos, + uint8_t __user *user_priv_data, + uint64_t *priv_offset) +{ + struct kfd_criu_bo_bucket *bo_buckets; + struct kfd_criu_bo_priv_data *bo_privs; + int ret = 0, pdd_index, bo_index = 0, id; + void *mem; + + bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL); + if (!bo_buckets) + return -ENOMEM; + + bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL); + if (!bo_privs) { + ret = -ENOMEM; + goto exit; + } + + for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) { + struct kfd_process_device *pdd = p->pdds[pdd_index]; + struct amdgpu_bo *dumper_bo; + struct kgd_mem *kgd_mem; + + idr_for_each_entry(&pdd->alloc_idr, mem, id) { + struct kfd_criu_bo_bucket *bo_bucket; + struct kfd_criu_bo_priv_data *bo_priv; + int i, dev_idx = 0; + + if (!mem) { + ret = 
-ENOMEM; + goto exit; + } + + kgd_mem = (struct kgd_mem *)mem; + dumper_bo = kgd_mem->bo; + + if ((uint64_t)kgd_mem->va <= pdd->gpuvm_base) + continue; + + bo_bucket = &bo_buckets[bo_index]; + bo_priv = &bo_privs[bo_index]; + + bo_bucket->gpu_id = pdd->user_gpu_id; + bo_bucket->addr = (uint64_t)kgd_mem->va; + bo_bucket->size = amdgpu_bo_size(dumper_bo); + bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags; + bo_priv->idr_handle = id; + + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { + ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo, + &bo_priv->user_addr); + if (ret) { + pr_err("Failed to obtain user address for user-pointer bo\n"); + goto exit; + } + } + if (bo_bucket->alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + ret = criu_get_prime_handle(&dumper_bo->tbo.base, + bo_bucket->alloc_flags & + KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0, + &bo_bucket->dmabuf_fd); + if (ret) + goto exit; + } else { + bo_bucket->dmabuf_fd = KFD_INVALID_FD; + } + + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) + bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL | + KFD_MMAP_GPU_ID(pdd->dev->id); + else if (bo_bucket->alloc_flags & + KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) + bo_bucket->offset = KFD_MMAP_TYPE_MMIO | + KFD_MMAP_GPU_ID(pdd->dev->id); + else + bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo); + + for (i = 0; i < p->n_pdds; i++) { + if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem)) + bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id; + } + + pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n" + "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x", + bo_bucket->size, + bo_bucket->addr, + bo_bucket->offset, + bo_bucket->gpu_id, + bo_bucket->alloc_flags, + bo_priv->idr_handle); + bo_index++; + } + } + + ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets)); + if (ret) { + pr_err("Failed to copy BO information to user\n"); + ret = -EFAULT; + goto exit; + } + + ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs)); + if (ret) { + pr_err("Failed to copy BO priv information to user\n"); + ret = -EFAULT; + goto exit; + } + + *priv_offset += num_bos * sizeof(*bo_privs); + +exit: + while (ret && bo_index--) { + if (bo_buckets[bo_index].alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) + close_fd(bo_buckets[bo_index].dmabuf_fd); + } + + kvfree(bo_buckets); + kvfree(bo_privs); + return ret; +} + +static int criu_get_process_object_info(struct kfd_process *p, + uint32_t *num_devices, + uint32_t *num_bos, + uint32_t *num_objects, + uint64_t *objs_priv_size) +{ + uint64_t queues_priv_data_size, svm_priv_data_size, priv_size; + uint32_t num_queues, num_events, num_svm_ranges; + int ret; + + *num_devices = p->n_pdds; + *num_bos = get_process_num_bos(p); + + ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size); + if (ret) + return ret; + + num_events = kfd_get_num_events(p); + + ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size); + if (ret) + return ret; + + *num_objects = num_queues + num_events + num_svm_ranges; + + if (objs_priv_size) { + priv_size = sizeof(struct kfd_criu_process_priv_data); + priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data); + priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data); + priv_size += queues_priv_data_size; + priv_size += num_events * sizeof(struct kfd_criu_event_priv_data); + priv_size += svm_priv_data_size; + *objs_priv_size = 
priv_size; + } + return 0; +} + +static int criu_checkpoint(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + int ret; + uint32_t num_devices, num_bos, num_objects; + uint64_t priv_size, priv_offset = 0, bo_priv_offset; + + if (!args->devices || !args->bos || !args->priv_data) + return -EINVAL; + + mutex_lock(&p->mutex); + + if (!p->n_pdds) { + pr_err("No pdd for given process\n"); + ret = -ENODEV; + goto exit_unlock; + } + + /* Confirm all process queues are evicted */ + if (!p->queues_paused) { + pr_err("Cannot dump process when queues are not in evicted state\n"); + /* CRIU plugin did not call op PROCESS_INFO before checkpointing */ + ret = -EINVAL; + goto exit_unlock; + } + + ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size); + if (ret) + goto exit_unlock; + + if (num_devices != args->num_devices || + num_bos != args->num_bos || + num_objects != args->num_objects || + priv_size != args->priv_data_size) { + + ret = -EINVAL; + goto exit_unlock; + } + + /* each function will store private data inside priv_data and adjust priv_offset */ + ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset); + if (ret) + goto exit_unlock; + + ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices, + (uint8_t __user *)args->priv_data, &priv_offset); + if (ret) + goto exit_unlock; + + /* Leave room for BOs in the private data. They need to be restored + * before events, but we checkpoint them last to simplify the error + * handling. + */ + bo_priv_offset = priv_offset; + priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data); + + if (num_objects) { + ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data, + &priv_offset); + if (ret) + goto exit_unlock; + + ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data, + &priv_offset); + if (ret) + goto exit_unlock; + + ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset); + if (ret) + goto exit_unlock; + } + + /* This must be the last thing in this function that can fail. + * Otherwise we leak dmabuf file descriptors. 
+ */ + ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos, + (uint8_t __user *)args->priv_data, &bo_priv_offset); + +exit_unlock: + mutex_unlock(&p->mutex); + if (ret) + pr_err("Failed to dump CRIU ret:%d\n", ret); + else + pr_debug("CRIU dump ret:%d\n", ret); + + return ret; +} + +static int criu_restore_process(struct kfd_process *p, + struct kfd_ioctl_criu_args *args, + uint64_t *priv_offset, + uint64_t max_priv_data_size) +{ + int ret = 0; + struct kfd_criu_process_priv_data process_priv; + + if (*priv_offset + sizeof(process_priv) > max_priv_data_size) + return -EINVAL; + + ret = copy_from_user(&process_priv, + (void __user *)(args->priv_data + *priv_offset), + sizeof(process_priv)); + if (ret) { + pr_err("Failed to copy process private information from user\n"); + ret = -EFAULT; + goto exit; + } + *priv_offset += sizeof(process_priv); + + if (process_priv.version != KFD_CRIU_PRIV_VERSION) { + pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n", + process_priv.version, KFD_CRIU_PRIV_VERSION); + return -EINVAL; + } + + pr_debug("Setting XNACK mode\n"); + if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) { + pr_err("xnack mode cannot be set\n"); + ret = -EPERM; + goto exit; + } else { + pr_debug("set xnack mode: %d\n", process_priv.xnack_mode); + p->xnack_enabled = process_priv.xnack_mode; + } + +exit: + return ret; +} + +static int criu_restore_devices(struct kfd_process *p, + struct kfd_ioctl_criu_args *args, + uint64_t *priv_offset, + uint64_t max_priv_data_size) +{ + struct kfd_criu_device_bucket *device_buckets; + struct kfd_criu_device_priv_data *device_privs; + int ret = 0; + uint32_t i; + + if (args->num_devices != p->n_pdds) + return -EINVAL; + + if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size) + return -EINVAL; + + device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL); + if (!device_buckets) + return -ENOMEM; + + ret = copy_from_user(device_buckets, (void __user *)args->devices, + args->num_devices * sizeof(*device_buckets)); + if (ret) { + pr_err("Failed to copy devices buckets from user\n"); + ret = -EFAULT; + goto exit; + } + + for (i = 0; i < args->num_devices; i++) { + struct kfd_dev *dev; + struct kfd_process_device *pdd; + struct file *drm_file; + + /* device private data is not currently used */ + + if (!device_buckets[i].user_gpu_id) { + pr_err("Invalid user gpu_id\n"); + ret = -EINVAL; + goto exit; + } + + dev = kfd_device_by_id(device_buckets[i].actual_gpu_id); + if (!dev) { + pr_err("Failed to find device with gpu_id = %x\n", + device_buckets[i].actual_gpu_id); + ret = -EINVAL; + goto exit; + } + + pdd = kfd_get_process_device_data(dev, p); + if (!pdd) { + pr_err("Failed to get pdd for gpu_id = %x\n", + device_buckets[i].actual_gpu_id); + ret = -EINVAL; + goto exit; + } + pdd->user_gpu_id = device_buckets[i].user_gpu_id; + + drm_file = fget(device_buckets[i].drm_fd); + if (!drm_file) { + pr_err("Invalid render node file descriptor sent from plugin (%d)\n", + device_buckets[i].drm_fd); + ret = -EINVAL; + goto exit; + } + + if (pdd->drm_file) { + ret = -EINVAL; + goto exit; + } + + /* create the vm using render nodes for kfd pdd */ + if (kfd_process_device_init_vm(pdd, drm_file)) { + pr_err("could not init vm for given pdd\n"); + /* On success, the PDD keeps the drm_file reference */ + fput(drm_file); + ret = -EINVAL; + goto exit; + } + /* + * pdd now already has the vm bound to render node so below api won't create a new + * exclusive kfd mapping but use 
existing one with renderDXXX but is still needed + * for iommu v2 binding and runtime pm. + */ + pdd = kfd_bind_process_to_device(dev, p); + if (IS_ERR(pdd)) { + ret = PTR_ERR(pdd); + goto exit; + } + + if (!pdd->doorbell_index && + kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) { + ret = -ENOMEM; + goto exit; + } + } + + /* + * We are not copying device private data from user as we are not using the data for now, + * but we still adjust for its private data. + */ + *priv_offset += args->num_devices * sizeof(*device_privs); + +exit: + kfree(device_buckets); + return ret; +} + +static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd, + struct kfd_criu_bo_bucket *bo_bucket, + struct kfd_criu_bo_priv_data *bo_priv, + struct kgd_mem **kgd_mem) +{ + int idr_handle; + int ret; + const bool criu_resume = true; + u64 offset; + + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) { + if (bo_bucket->size != kfd_doorbell_process_slice(pdd->dev)) + return -EINVAL; + + offset = kfd_get_process_doorbells(pdd); + if (!offset) + return -ENOMEM; + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + /* MMIO BOs need remapped bus address */ + if (bo_bucket->size != PAGE_SIZE) { + pr_err("Invalid page size\n"); + return -EINVAL; + } + offset = pdd->dev->adev->rmmio_remap.bus_addr; + if (!offset) { + pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n"); + return -ENOMEM; + } + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) { + offset = bo_priv->user_addr; + } + /* Create the BO */ + ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr, + bo_bucket->size, pdd->drm_priv, kgd_mem, + &offset, bo_bucket->alloc_flags, criu_resume); + if (ret) { + pr_err("Could not create the BO\n"); + return ret; + } + pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n", + bo_bucket->size, bo_bucket->addr, offset); + + /* Restore previous IDR handle */ + pr_debug("Restoring old IDR handle for the BO"); + idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle, + bo_priv->idr_handle + 1, GFP_KERNEL); + + if (idr_handle < 0) { + pr_err("Could not allocate idr\n"); + amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv, + NULL); + return -ENOMEM; + } + + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) + bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id); + if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) { + bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id); + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) { + bo_bucket->restored_offset = offset; + } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) { + bo_bucket->restored_offset = offset; + /* Update the VRAM usage count */ + WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size); + } + return 0; +} + +static int criu_restore_bo(struct kfd_process *p, + struct kfd_criu_bo_bucket *bo_bucket, + struct kfd_criu_bo_priv_data *bo_priv) +{ + struct kfd_process_device *pdd; + struct kgd_mem *kgd_mem; + int ret; + int j; + + pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n", + bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags, + bo_priv->idr_handle); + + pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id); + if (!pdd) { + pr_err("Failed to get pdd\n"); + return -ENODEV; + } + + ret = criu_restore_memory_of_gpu(pdd, 
bo_bucket, bo_priv, &kgd_mem); + if (ret) + return ret; + + /* now map these BOs to GPU/s */ + for (j = 0; j < p->n_pdds; j++) { + struct kfd_dev *peer; + struct kfd_process_device *peer_pdd; + + if (!bo_priv->mapped_gpuids[j]) + break; + + peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]); + if (!peer_pdd) + return -EINVAL; + + peer = peer_pdd->dev; + + peer_pdd = kfd_bind_process_to_device(peer, p); + if (IS_ERR(peer_pdd)) + return PTR_ERR(peer_pdd); + + ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem, + peer_pdd->drm_priv); + if (ret) { + pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds); + return ret; + } + } + + pr_debug("map memory was successful for the BO\n"); + /* create the dmabuf object and export the bo */ + if (bo_bucket->alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) { + ret = criu_get_prime_handle(&kgd_mem->bo->tbo.base, DRM_RDWR, + &bo_bucket->dmabuf_fd); + if (ret) + return ret; + } else { + bo_bucket->dmabuf_fd = KFD_INVALID_FD; + } + + return 0; +} + +static int criu_restore_bos(struct kfd_process *p, + struct kfd_ioctl_criu_args *args, + uint64_t *priv_offset, + uint64_t max_priv_data_size) +{ + struct kfd_criu_bo_bucket *bo_buckets = NULL; + struct kfd_criu_bo_priv_data *bo_privs = NULL; + int ret = 0; + uint32_t i = 0; + + if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size) + return -EINVAL; + + /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */ + amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info); + + bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL); + if (!bo_buckets) + return -ENOMEM; + + ret = copy_from_user(bo_buckets, (void __user *)args->bos, + args->num_bos * sizeof(*bo_buckets)); + if (ret) { + pr_err("Failed to copy BOs information from user\n"); + ret = -EFAULT; + goto exit; + } + + bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL); + if (!bo_privs) { + ret = -ENOMEM; + goto exit; + } + + ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset, + args->num_bos * sizeof(*bo_privs)); + if (ret) { + pr_err("Failed to copy BOs information from user\n"); + ret = -EFAULT; + goto exit; + } + *priv_offset += args->num_bos * sizeof(*bo_privs); + + /* Create and map new BOs */ + for (; i < args->num_bos; i++) { + ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]); + if (ret) { + pr_debug("Failed to restore BO[%d] ret%d\n", i, ret); + goto exit; + } + } /* done */ + + /* Copy only the buckets back so user can read bo_buckets[N].restored_offset */ + ret = copy_to_user((void __user *)args->bos, + bo_buckets, + (args->num_bos * sizeof(*bo_buckets))); + if (ret) + ret = -EFAULT; + +exit: + while (ret && i--) { + if (bo_buckets[i].alloc_flags + & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) + close_fd(bo_buckets[i].dmabuf_fd); + } + kvfree(bo_buckets); + kvfree(bo_privs); + return ret; +} + +static int criu_restore_objects(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args, + uint64_t *priv_offset, + uint64_t max_priv_data_size) +{ + int ret = 0; + uint32_t i; + + BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type)); + BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type)); + BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type)); + + for (i = 0; i < args->num_objects; i++) { + uint32_t object_type; + + if (*priv_offset + sizeof(object_type) > max_priv_data_size) { + 
pr_err("Invalid private data size\n"); + return -EINVAL; + } + + ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset)); + if (ret) { + pr_err("Failed to copy private information from user\n"); + goto exit; + } + + switch (object_type) { + case KFD_CRIU_OBJECT_TYPE_QUEUE: + ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data, + priv_offset, max_priv_data_size); + if (ret) + goto exit; + break; + case KFD_CRIU_OBJECT_TYPE_EVENT: + ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data, + priv_offset, max_priv_data_size); + if (ret) + goto exit; + break; + case KFD_CRIU_OBJECT_TYPE_SVM_RANGE: + ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data, + priv_offset, max_priv_data_size); + if (ret) + goto exit; + break; + default: + pr_err("Invalid object type:%u at index:%d\n", object_type, i); + ret = -EINVAL; + goto exit; + } + } +exit: + return ret; +} + +static int criu_restore(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + uint64_t priv_offset = 0; + int ret = 0; + + pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n", + args->num_devices, args->num_bos, args->num_objects, args->priv_data_size); + + if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size || + !args->num_devices || !args->num_bos) + return -EINVAL; + + mutex_lock(&p->mutex); + + /* + * Set the process to evicted state to avoid running any new queues before all the memory + * mappings are ready. + */ + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE); + if (ret) + goto exit_unlock; + + /* Each function will adjust priv_offset based on how many bytes they consumed */ + ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size); + if (ret) + goto exit_unlock; + + ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size); + if (ret) + goto exit_unlock; + + ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size); + if (ret) + goto exit_unlock; + + ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size); + if (ret) + goto exit_unlock; + + if (priv_offset != args->priv_data_size) { + pr_err("Invalid private data size\n"); + ret = -EINVAL; + } + +exit_unlock: + mutex_unlock(&p->mutex); + if (ret) + pr_err("Failed to restore CRIU ret:%d\n", ret); + else + pr_debug("CRIU restore successful\n"); + + return ret; +} + +static int criu_unpause(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + int ret; + + mutex_lock(&p->mutex); + + if (!p->queues_paused) { + mutex_unlock(&p->mutex); + return -EINVAL; + } + + ret = kfd_process_restore_queues(p); + if (ret) + pr_err("Failed to unpause queues ret:%d\n", ret); + else + p->queues_paused = false; + + mutex_unlock(&p->mutex); + + return ret; +} + +static int criu_resume(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + struct kfd_process *target = NULL; + struct pid *pid = NULL; + int ret = 0; + + pr_debug("Inside %s, target pid for criu restore: %d\n", __func__, + args->pid); + + pid = find_get_pid(args->pid); + if (!pid) { + pr_err("Cannot find pid info for %i\n", args->pid); + return -ESRCH; + } + + pr_debug("calling kfd_lookup_process_by_pid\n"); + target = kfd_lookup_process_by_pid(pid); + + put_pid(pid); + + if (!target) { + pr_debug("Cannot find process info for %i\n", args->pid); + return -ESRCH; + } + + mutex_lock(&target->mutex); + ret = kfd_criu_resume_svm(target); + if 
(ret) { + pr_err("kfd_criu_resume_svm failed for %i\n", args->pid); + goto exit; + } + + ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info); + if (ret) + pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid); + +exit: + mutex_unlock(&target->mutex); + + kfd_unref_process(target); + return ret; +} + +static int criu_process_info(struct file *filep, + struct kfd_process *p, + struct kfd_ioctl_criu_args *args) +{ + int ret = 0; + + mutex_lock(&p->mutex); + + if (!p->n_pdds) { + pr_err("No pdd for given process\n"); + ret = -ENODEV; + goto err_unlock; + } + + ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT); + if (ret) + goto err_unlock; + + p->queues_paused = true; + + args->pid = task_pid_nr_ns(p->lead_thread, + task_active_pid_ns(p->lead_thread)); + + ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos, + &args->num_objects, &args->priv_data_size); + if (ret) + goto err_unlock; + + dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n", + args->num_devices, args->num_bos, args->num_objects, + args->priv_data_size); + +err_unlock: + if (ret) { + kfd_process_restore_queues(p); + p->queues_paused = false; + } + mutex_unlock(&p->mutex); + return ret; +} + +static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data) +{ + struct kfd_ioctl_criu_args *args = data; + int ret; + + dev_dbg(kfd_device, "CRIU operation: %d\n", args->op); + switch (args->op) { + case KFD_CRIU_OP_PROCESS_INFO: + ret = criu_process_info(filep, p, args); + break; + case KFD_CRIU_OP_CHECKPOINT: + ret = criu_checkpoint(filep, p, args); + break; + case KFD_CRIU_OP_UNPAUSE: + ret = criu_unpause(filep, p, args); + break; + case KFD_CRIU_OP_RESTORE: + ret = criu_restore(filep, p, args); + break; + case KFD_CRIU_OP_RESUME: + ret = criu_resume(filep, p, args); + break; + default: + dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op); + ret = -EINVAL; + break; + } + + if (ret) + dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret); + + return ret; +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1898,16 +2682,16 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_WAIT_EVENTS, kfd_ioctl_wait_events, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER_DEPRECATED, kfd_ioctl_dbg_register, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED, kfd_ioctl_dbg_unregister, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED, kfd_ioctl_dbg_address_watch, 0), - AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL, + AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED, kfd_ioctl_dbg_wave_control, 0), AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA, @@ -1959,6 +2743,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE, kfd_ioctl_set_xnack_mode, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP, + kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY, + kfd_ioctl_get_available_memory, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) @@ -1973,6 +2763,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) char *kdata = NULL; unsigned int usize, asize; int retcode = -EINVAL; + bool ptrace_attached = 
false; if (nr >= AMDKFD_CORE_IOCTL_COUNT) goto err_i1; @@ -1998,7 +2789,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) * processes need to create their own KFD device context. */ process = filep->private_data; - if (process->lead_thread != current->group_leader) { + + rcu_read_lock(); + if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) && + ptrace_parent(process->lead_thread) == current) + ptrace_attached = true; + rcu_read_unlock(); + + if (process->lead_thread != current->group_leader + && !ptrace_attached) { dev_dbg(kfd_device, "Using KFD FD in wrong process\n"); retcode = -EBADF; goto err_i1; @@ -2013,6 +2812,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) goto err_i1; } + /* + * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support + * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a + * more priviledged access. + */ + if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) { + if (!capable(CAP_CHECKPOINT_RESTORE) && + !capable(CAP_SYS_ADMIN)) { + retcode = -EACCES; + goto err_i1; + } + } + if (cmd & (IOC_IN | IOC_OUT)) { if (asize <= sizeof(stack_kdata)) { kdata = stack_kdata; @@ -2061,7 +2873,6 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, struct vm_area_struct *vma) { phys_addr_t address; - int ret; if (vma->vm_end - vma->vm_start != PAGE_SIZE) return -EINVAL; @@ -2081,12 +2892,11 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process, process->pasid, (unsigned long long) vma->vm_start, address, vma->vm_flags, PAGE_SIZE); - ret = io_remap_pfn_range(vma, + return io_remap_pfn_range(vma, vma->vm_start, address >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot); - return ret; } |
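The kfd_ioctl_criu() dispatcher above implies a fixed call sequence for a checkpoint/restore plugin: PROCESS_INFO (which evicts the queues and reports object counts and the private-data size), CHECKPOINT while the queues stay paused, then UNPAUSE; on the restore side, RESTORE followed by RESUME on the target pid. Below is a hedged userspace sketch of the checkpoint half. The struct, op constants and ioctl number come from this patch; the header path, exact field types, buffer handling and error handling are assumptions, and cleanup on failure is omitted. The ioctl is gated on CAP_CHECKPOINT_RESTORE or CAP_SYS_ADMIN (or a ptrace attachment to the fd's owner).

#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/kfd_ioctl.h>	/* assumed location of the KFD uapi header */

/* Sketch only: checkpoint-side CRIU flow for the target's /dev/kfd fd. */
static int kfd_criu_checkpoint_sketch(int kfd_fd)
{
	struct kfd_ioctl_criu_args args;
	struct kfd_criu_device_bucket *devs;
	struct kfd_criu_bo_bucket *bos;
	void *priv;

	/* 1. PROCESS_INFO evicts (pauses) the queues and fills in
	 *    num_devices/num_bos/num_objects/priv_data_size.
	 */
	memset(&args, 0, sizeof(args));
	args.op = KFD_CRIU_OP_PROCESS_INFO;
	if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, &args))
		return -1;

	/* Allocate buffers of the reported sizes; the args fields are
	 * assumed to be __u64 user pointers, hence the casts.
	 */
	devs = calloc(args.num_devices, sizeof(*devs));
	bos = calloc(args.num_bos, sizeof(*bos));
	priv = malloc(args.priv_data_size);
	if (!devs || !bos || !priv)
		return -1;
	args.devices = (uint64_t)(uintptr_t)devs;
	args.bos = (uint64_t)(uintptr_t)bos;
	args.priv_data = (uint64_t)(uintptr_t)priv;

	/* 2. CHECKPOINT is only accepted while the queues are paused. */
	args.op = KFD_CRIU_OP_CHECKPOINT;
	if (ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, &args))
		return -1;

	/* 3. UNPAUSE lets the dumped process run again. */
	args.op = KFD_CRIU_OP_UNPAUSE;
	return ioctl(kfd_fd, AMDKFD_IOC_CRIU_OP, &args);
}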