aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_chardev.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_chardev.c1746
1 files changed, 1278 insertions, 468 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 24ebd61395d8..6d291aa6386b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Copyright 2014 Advanced Micro Devices, Inc.
+ * Copyright 2014-2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -33,14 +34,16 @@
#include <linux/time.h>
#include <linux/mm.h>
#include <linux/mman.h>
+#include <linux/ptrace.h>
#include <linux/dma-buf.h>
-#include <asm/processor.h>
+#include <linux/fdtable.h>
+#include <linux/processor.h>
#include "kfd_priv.h"
#include "kfd_device_queue_manager.h"
-#include "kfd_dbgmgr.h"
#include "kfd_svm.h"
#include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"
+#include "amdgpu_dma_buf.h"
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
@@ -62,6 +65,25 @@ static int kfd_char_dev_major = -1;
static struct class *kfd_class;
struct device *kfd_device;
+static inline struct kfd_process_device *kfd_lock_pdd_by_id(struct kfd_process *p, __u32 gpu_id)
+{
+ struct kfd_process_device *pdd;
+
+ mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, gpu_id);
+
+ if (pdd)
+ return pdd;
+
+ mutex_unlock(&p->mutex);
+ return NULL;
+}
+
+static inline void kfd_unlock_pdd(struct kfd_process_device *pdd)
+{
+ mutex_unlock(&pdd->process->mutex);
+}
+
int kfd_chardev_init(void)
{
int err = 0;
@@ -101,11 +123,6 @@ void kfd_chardev_exit(void)
kfd_device = NULL;
}
-struct device *kfd_chardev(void)
-{
- return kfd_device;
-}
-
static int kfd_open(struct inode *inode, struct file *filep)
{
@@ -282,6 +299,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
struct kfd_process_device *pdd;
struct queue_properties q_properties;
uint32_t doorbell_offset_in_process = 0;
+ struct amdgpu_bo *wptr_bo = NULL;
memset(&q_properties, 0, sizeof(struct queue_properties));
@@ -292,26 +310,72 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
return err;
pr_debug("Looking for gpu id 0x%x\n", args->gpu_id);
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev) {
- pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
- return -EINVAL;
- }
mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ if (!pdd) {
+ pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
+ err = -EINVAL;
+ goto err_pdd;
+ }
+ dev = pdd->dev;
+
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
err = -ESRCH;
goto err_bind_process;
}
+ if (!pdd->doorbell_index &&
+ kfd_alloc_process_doorbells(dev, &pdd->doorbell_index) < 0) {
+ err = -ENOMEM;
+ goto err_alloc_doorbells;
+ }
+
+ /* Starting with GFX11, wptr BOs must be mapped to GART for MES to determine work
+ * on unmapped queues for usermode queue oversubscription (no aggregated doorbell)
+ */
+ if (dev->shared_resources.enable_mes &&
+ ((dev->adev->mes.sched_version & AMDGPU_MES_API_VERSION_MASK)
+ >> AMDGPU_MES_API_VERSION_SHIFT) >= 2) {
+ struct amdgpu_bo_va_mapping *wptr_mapping;
+ struct amdgpu_vm *wptr_vm;
+
+ wptr_vm = drm_priv_to_vm(pdd->drm_priv);
+ err = amdgpu_bo_reserve(wptr_vm->root.bo, false);
+ if (err)
+ goto err_wptr_map_gart;
+
+ wptr_mapping = amdgpu_vm_bo_lookup_mapping(
+ wptr_vm, args->write_pointer_address >> PAGE_SHIFT);
+ amdgpu_bo_unreserve(wptr_vm->root.bo);
+ if (!wptr_mapping) {
+ pr_err("Failed to lookup wptr bo\n");
+ err = -EINVAL;
+ goto err_wptr_map_gart;
+ }
+
+ wptr_bo = wptr_mapping->bo_va->base.bo;
+ if (wptr_bo->tbo.base.size > PAGE_SIZE) {
+ pr_err("Requested GART mapping for wptr bo larger than one page\n");
+ err = -EINVAL;
+ goto err_wptr_map_gart;
+ }
+
+ err = amdgpu_amdkfd_map_gtt_bo_to_gart(dev->adev, wptr_bo);
+ if (err) {
+ pr_err("Failed to map wptr bo to GART\n");
+ goto err_wptr_map_gart;
+ }
+ }
+
pr_debug("Creating queue for PASID 0x%x on gpu 0x%x\n",
p->pasid,
dev->id);
- err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id,
- &doorbell_offset_in_process);
+ err = pqm_create_queue(&p->pqm, dev, filep, &q_properties, &queue_id, wptr_bo,
+ NULL, NULL, NULL, &doorbell_offset_in_process);
if (err != 0)
goto err_create_queue;
@@ -321,7 +385,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
/* Return gpu_id as doorbell offset for mmap usage */
args->doorbell_offset = KFD_MMAP_TYPE_DOORBELL;
args->doorbell_offset |= KFD_MMAP_GPU_ID(args->gpu_id);
- if (KFD_IS_SOC15(dev->device_info->asic_family))
+ if (KFD_IS_SOC15(dev))
/* On SOC15 ASICs, include the doorbell offset within the
* process doorbell frame, which is 2 pages.
*/
@@ -343,7 +407,12 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
return 0;
err_create_queue:
+ if (wptr_bo)
+ amdgpu_amdkfd_free_gtt_mem(dev->adev, wptr_bo);
+err_wptr_map_gart:
+err_alloc_doorbells:
err_bind_process:
+err_pdd:
mutex_unlock(&p->mutex);
return err;
}
@@ -490,7 +559,6 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
struct kfd_process *p, void *data)
{
struct kfd_ioctl_set_memory_policy_args *args = data;
- struct kfd_dev *dev;
int err = 0;
struct kfd_process_device *pdd;
enum cache_policy default_policy, alternate_policy;
@@ -505,13 +573,15 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
return -EINVAL;
}
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ if (!pdd) {
+ pr_debug("Could not find gpu id 0x%x\n", args->gpu_id);
+ err = -EINVAL;
+ goto err_pdd;
+ }
- pdd = kfd_bind_process_to_device(dev, p);
+ pdd = kfd_bind_process_to_device(pdd->dev, p);
if (IS_ERR(pdd)) {
err = -ESRCH;
goto out;
@@ -524,7 +594,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
(args->alternate_policy == KFD_IOC_CACHE_POLICY_COHERENT)
? cache_policy_coherent : cache_policy_noncoherent;
- if (!dev->dqm->ops.set_cache_memory_policy(dev->dqm,
+ if (!pdd->dev->dqm->ops.set_cache_memory_policy(pdd->dev->dqm,
&pdd->qpd,
default_policy,
alternate_policy,
@@ -533,6 +603,7 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
err = -EINVAL;
out:
+err_pdd:
mutex_unlock(&p->mutex);
return err;
@@ -542,17 +613,18 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
struct kfd_process *p, void *data)
{
struct kfd_ioctl_set_trap_handler_args *args = data;
- struct kfd_dev *dev;
int err = 0;
struct kfd_process_device *pdd;
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
mutex_lock(&p->mutex);
- pdd = kfd_bind_process_to_device(dev, p);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ if (!pdd) {
+ err = -EINVAL;
+ goto err_pdd;
+ }
+
+ pdd = kfd_bind_process_to_device(pdd->dev, p);
if (IS_ERR(pdd)) {
err = -ESRCH;
goto out;
@@ -561,6 +633,7 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
kfd_process_set_trap_handler(&pdd->qpd, args->tba_addr, args->tma_addr);
out:
+err_pdd:
mutex_unlock(&p->mutex);
return err;
@@ -569,289 +642,40 @@ out:
static int kfd_ioctl_dbg_register(struct file *filep,
struct kfd_process *p, void *data)
{
- struct kfd_ioctl_dbg_register_args *args = data;
- struct kfd_dev *dev;
- struct kfd_dbgmgr *dbgmgr_ptr;
- struct kfd_process_device *pdd;
- bool create_ok;
- long status = 0;
-
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_register not supported on CZ\n");
- return -EINVAL;
- }
-
- mutex_lock(&p->mutex);
- mutex_lock(kfd_get_dbgmgr_mutex());
-
- /*
- * make sure that we have pdd, if this the first queue created for
- * this process
- */
- pdd = kfd_bind_process_to_device(dev, p);
- if (IS_ERR(pdd)) {
- status = PTR_ERR(pdd);
- goto out;
- }
-
- if (!dev->dbgmgr) {
- /* In case of a legal call, we have no dbgmgr yet */
- create_ok = kfd_dbgmgr_create(&dbgmgr_ptr, dev);
- if (create_ok) {
- status = kfd_dbgmgr_register(dbgmgr_ptr, p);
- if (status != 0)
- kfd_dbgmgr_destroy(dbgmgr_ptr);
- else
- dev->dbgmgr = dbgmgr_ptr;
- }
- } else {
- pr_debug("debugger already registered\n");
- status = -EINVAL;
- }
-
-out:
- mutex_unlock(kfd_get_dbgmgr_mutex());
- mutex_unlock(&p->mutex);
-
- return status;
+ return -EPERM;
}
static int kfd_ioctl_dbg_unregister(struct file *filep,
struct kfd_process *p, void *data)
{
- struct kfd_ioctl_dbg_unregister_args *args = data;
- struct kfd_dev *dev;
- long status;
-
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev || !dev->dbgmgr)
- return -EINVAL;
-
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_unregister not supported on CZ\n");
- return -EINVAL;
- }
-
- mutex_lock(kfd_get_dbgmgr_mutex());
-
- status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
- if (!status) {
- kfd_dbgmgr_destroy(dev->dbgmgr);
- dev->dbgmgr = NULL;
- }
-
- mutex_unlock(kfd_get_dbgmgr_mutex());
-
- return status;
+ return -EPERM;
}
-/*
- * Parse and generate variable size data structure for address watch.
- * Total size of the buffer and # watch points is limited in order
- * to prevent kernel abuse. (no bearing to the much smaller HW limitation
- * which is enforced by dbgdev module)
- * please also note that the watch address itself are not "copied from user",
- * since it be set into the HW in user mode values.
- *
- */
static int kfd_ioctl_dbg_address_watch(struct file *filep,
struct kfd_process *p, void *data)
{
- struct kfd_ioctl_dbg_address_watch_args *args = data;
- struct kfd_dev *dev;
- struct dbg_address_watch_info aw_info;
- unsigned char *args_buff;
- long status;
- void __user *cmd_from_user;
- uint64_t watch_mask_value = 0;
- unsigned int args_idx = 0;
-
- memset((void *) &aw_info, 0, sizeof(struct dbg_address_watch_info));
-
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
- return -EINVAL;
- }
-
- cmd_from_user = (void __user *) args->content_ptr;
-
- /* Validate arguments */
-
- if ((args->buf_size_in_bytes > MAX_ALLOWED_AW_BUFF_SIZE) ||
- (args->buf_size_in_bytes <= sizeof(*args) + sizeof(int) * 2) ||
- (cmd_from_user == NULL))
- return -EINVAL;
-
- /* this is the actual buffer to work with */
- args_buff = memdup_user(cmd_from_user,
- args->buf_size_in_bytes - sizeof(*args));
- if (IS_ERR(args_buff))
- return PTR_ERR(args_buff);
-
- aw_info.process = p;
-
- aw_info.num_watch_points = *((uint32_t *)(&args_buff[args_idx]));
- args_idx += sizeof(aw_info.num_watch_points);
-
- aw_info.watch_mode = (enum HSA_DBG_WATCH_MODE *) &args_buff[args_idx];
- args_idx += sizeof(enum HSA_DBG_WATCH_MODE) * aw_info.num_watch_points;
-
- /*
- * set watch address base pointer to point on the array base
- * within args_buff
- */
- aw_info.watch_address = (uint64_t *) &args_buff[args_idx];
-
- /* skip over the addresses buffer */
- args_idx += sizeof(aw_info.watch_address) * aw_info.num_watch_points;
-
- if (args_idx >= args->buf_size_in_bytes - sizeof(*args)) {
- status = -EINVAL;
- goto out;
- }
-
- watch_mask_value = (uint64_t) args_buff[args_idx];
-
- if (watch_mask_value > 0) {
- /*
- * There is an array of masks.
- * set watch mask base pointer to point on the array base
- * within args_buff
- */
- aw_info.watch_mask = (uint64_t *) &args_buff[args_idx];
-
- /* skip over the masks buffer */
- args_idx += sizeof(aw_info.watch_mask) *
- aw_info.num_watch_points;
- } else {
- /* just the NULL mask, set to NULL and skip over it */
- aw_info.watch_mask = NULL;
- args_idx += sizeof(aw_info.watch_mask);
- }
-
- if (args_idx >= args->buf_size_in_bytes - sizeof(args)) {
- status = -EINVAL;
- goto out;
- }
-
- /* Currently HSA Event is not supported for DBG */
- aw_info.watch_event = NULL;
-
- mutex_lock(kfd_get_dbgmgr_mutex());
-
- status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
-
- mutex_unlock(kfd_get_dbgmgr_mutex());
-
-out:
- kfree(args_buff);
-
- return status;
+ return -EPERM;
}
/* Parse and generate fixed size data structure for wave control */
static int kfd_ioctl_dbg_wave_control(struct file *filep,
struct kfd_process *p, void *data)
{
- struct kfd_ioctl_dbg_wave_control_args *args = data;
- struct kfd_dev *dev;
- struct dbg_wave_control_info wac_info;
- unsigned char *args_buff;
- uint32_t computed_buff_size;
- long status;
- void __user *cmd_from_user;
- unsigned int args_idx = 0;
-
- memset((void *) &wac_info, 0, sizeof(struct dbg_wave_control_info));
-
- /* we use compact form, independent of the packing attribute value */
- computed_buff_size = sizeof(*args) +
- sizeof(wac_info.mode) +
- sizeof(wac_info.operand) +
- sizeof(wac_info.dbgWave_msg.DbgWaveMsg) +
- sizeof(wac_info.dbgWave_msg.MemoryVA) +
- sizeof(wac_info.trapId);
-
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
- if (dev->device_info->asic_family == CHIP_CARRIZO) {
- pr_debug("kfd_ioctl_dbg_wave_control not supported on CZ\n");
- return -EINVAL;
- }
-
- /* input size must match the computed "compact" size */
- if (args->buf_size_in_bytes != computed_buff_size) {
- pr_debug("size mismatch, computed : actual %u : %u\n",
- args->buf_size_in_bytes, computed_buff_size);
- return -EINVAL;
- }
-
- cmd_from_user = (void __user *) args->content_ptr;
-
- if (cmd_from_user == NULL)
- return -EINVAL;
-
- /* copy the entire buffer from user */
-
- args_buff = memdup_user(cmd_from_user,
- args->buf_size_in_bytes - sizeof(*args));
- if (IS_ERR(args_buff))
- return PTR_ERR(args_buff);
-
- /* move ptr to the start of the "pay-load" area */
- wac_info.process = p;
-
- wac_info.operand = *((enum HSA_DBG_WAVEOP *)(&args_buff[args_idx]));
- args_idx += sizeof(wac_info.operand);
-
- wac_info.mode = *((enum HSA_DBG_WAVEMODE *)(&args_buff[args_idx]));
- args_idx += sizeof(wac_info.mode);
-
- wac_info.trapId = *((uint32_t *)(&args_buff[args_idx]));
- args_idx += sizeof(wac_info.trapId);
-
- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value =
- *((uint32_t *)(&args_buff[args_idx]));
- wac_info.dbgWave_msg.MemoryVA = NULL;
-
- mutex_lock(kfd_get_dbgmgr_mutex());
-
- pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n",
- wac_info.process, wac_info.operand,
- wac_info.mode, wac_info.trapId,
- wac_info.dbgWave_msg.DbgWaveMsg.WaveMsgInfoGen2.Value);
-
- status = kfd_dbgmgr_wave_control(dev->dbgmgr, &wac_info);
-
- pr_debug("Returned status of dbg manager is %ld\n", status);
-
- mutex_unlock(kfd_get_dbgmgr_mutex());
-
- kfree(args_buff);
-
- return status;
+ return -EPERM;
}
static int kfd_ioctl_get_clock_counters(struct file *filep,
struct kfd_process *p, void *data)
{
struct kfd_ioctl_get_clock_counters_args *args = data;
- struct kfd_dev *dev;
+ struct kfd_process_device *pdd;
- dev = kfd_device_by_id(args->gpu_id);
- if (dev)
+ mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ mutex_unlock(&p->mutex);
+ if (pdd)
/* Reading GPU clock counter from KGD */
- args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(dev->kgd);
+ args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(pdd->dev->adev);
else
/* Node without GPU resource */
args->gpu_clock_counter = 0;
@@ -1007,57 +831,11 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
* through the event_page_offset field.
*/
if (args->event_page_offset) {
- struct kfd_dev *kfd;
- struct kfd_process_device *pdd;
- void *mem, *kern_addr;
- uint64_t size;
-
- kfd = kfd_device_by_id(GET_GPU_ID(args->event_page_offset));
- if (!kfd) {
- pr_err("Getting device by id failed in %s\n", __func__);
- return -EINVAL;
- }
-
mutex_lock(&p->mutex);
-
- if (p->signal_page) {
- pr_err("Event page is already set\n");
- err = -EINVAL;
- goto out_unlock;
- }
-
- pdd = kfd_bind_process_to_device(kfd, p);
- if (IS_ERR(pdd)) {
- err = PTR_ERR(pdd);
- goto out_unlock;
- }
-
- mem = kfd_process_device_translate_handle(pdd,
- GET_IDR_HANDLE(args->event_page_offset));
- if (!mem) {
- pr_err("Can't find BO, offset is 0x%llx\n",
- args->event_page_offset);
- err = -EINVAL;
- goto out_unlock;
- }
-
- err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->kgd,
- mem, &kern_addr, &size);
- if (err) {
- pr_err("Failed to map event page to kernel\n");
- goto out_unlock;
- }
-
- err = kfd_event_page_set(p, kern_addr, size);
- if (err) {
- pr_err("Failed to set event page\n");
- amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(kfd->kgd, mem);
- goto out_unlock;
- }
-
- p->signal_handle = args->event_page_offset;
-
+ err = kfd_kmap_event_page(p, args->event_page_offset);
mutex_unlock(&p->mutex);
+ if (err)
+ return err;
}
err = kfd_event_create(filp, p, args->event_type,
@@ -1066,10 +844,7 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
&args->event_page_offset,
&args->event_slot_index);
- return err;
-
-out_unlock:
- mutex_unlock(&p->mutex);
+ pr_debug("Created event (id:0x%08x) (%s)\n", args->event_id, __func__);
return err;
}
@@ -1101,14 +876,11 @@ static int kfd_ioctl_wait_events(struct file *filp, struct kfd_process *p,
void *data)
{
struct kfd_ioctl_wait_events_args *args = data;
- int err;
- err = kfd_wait_on_events(p, args->num_events,
+ return kfd_wait_on_events(p, args->num_events,
(void __user *)args->events_ptr,
(args->wait_for_all != 0),
- args->timeout, &args->wait_result);
-
- return err;
+ &args->timeout, &args->wait_result);
}
static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
struct kfd_process *p, void *data)
@@ -1118,11 +890,13 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
struct kfd_dev *dev;
long err;
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ if (!pdd) {
+ err = -EINVAL;
+ goto err_pdd;
+ }
+ dev = pdd->dev;
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -1137,11 +911,12 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
pdd->qpd.vmid != 0 && dev->kfd2kgd->set_scratch_backing_va)
dev->kfd2kgd->set_scratch_backing_va(
- dev->kgd, args->va_addr, pdd->qpd.vmid);
+ dev->adev, args->va_addr, pdd->qpd.vmid);
return 0;
bind_process_to_device_fail:
+err_pdd:
mutex_unlock(&p->mutex);
return err;
}
@@ -1150,15 +925,17 @@ static int kfd_ioctl_get_tile_config(struct file *filep,
struct kfd_process *p, void *data)
{
struct kfd_ioctl_get_tile_config_args *args = data;
- struct kfd_dev *dev;
+ struct kfd_process_device *pdd;
struct tile_config config;
int err = 0;
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
+ mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ mutex_unlock(&p->mutex);
+ if (!pdd)
return -EINVAL;
- amdgpu_amdkfd_get_tile_config(dev->kgd, &config);
+ amdgpu_amdkfd_get_tile_config(pdd->dev->adev, &config);
args->gb_addr_config = config.gb_addr_config;
args->num_banks = config.num_banks;
@@ -1193,40 +970,37 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
{
struct kfd_ioctl_acquire_vm_args *args = data;
struct kfd_process_device *pdd;
- struct kfd_dev *dev;
struct file *drm_file;
int ret;
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
drm_file = fget(args->drm_fd);
if (!drm_file)
return -EINVAL;
mutex_lock(&p->mutex);
-
- pdd = kfd_get_process_device_data(dev, p);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
if (!pdd) {
ret = -EINVAL;
- goto err_unlock;
+ goto err_pdd;
}
if (pdd->drm_file) {
ret = pdd->drm_file == drm_file ? 0 : -EBUSY;
- goto err_unlock;
+ goto err_drm_file;
}
ret = kfd_process_device_init_vm(pdd, drm_file);
if (ret)
goto err_unlock;
+
/* On success, the PDD keeps the drm_file reference */
mutex_unlock(&p->mutex);
return 0;
err_unlock:
+err_pdd:
+err_drm_file:
mutex_unlock(&p->mutex);
fput(drm_file);
return ret;
@@ -1234,8 +1008,6 @@ err_unlock:
bool kfd_dev_is_large_bar(struct kfd_dev *dev)
{
- struct kfd_local_mem_info mem_info;
-
if (debug_largebar) {
pr_debug("Simulate large-bar allocation on non large-bar machine\n");
return true;
@@ -1244,13 +1016,25 @@ bool kfd_dev_is_large_bar(struct kfd_dev *dev)
if (dev->use_iommu_v2)
return false;
- amdgpu_amdkfd_get_local_mem_info(dev->kgd, &mem_info);
- if (mem_info.local_mem_size_private == 0 &&
- mem_info.local_mem_size_public > 0)
+ if (dev->local_mem_info.local_mem_size_private == 0 &&
+ dev->local_mem_info.local_mem_size_public > 0)
return true;
return false;
}
+static int kfd_ioctl_get_available_memory(struct file *filep,
+ struct kfd_process *p, void *data)
+{
+ struct kfd_ioctl_get_available_memory_args *args = data;
+ struct kfd_process_device *pdd = kfd_lock_pdd_by_id(p, args->gpu_id);
+
+ if (!pdd)
+ return -EINVAL;
+ args->available = amdgpu_amdkfd_get_available_memory(pdd->dev->adev);
+ kfd_unlock_pdd(pdd);
+ return 0;
+}
+
static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
struct kfd_process *p, void *data)
{
@@ -1283,19 +1067,23 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
}
mutex_unlock(&p->svms.lock);
#endif
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
+ mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ if (!pdd) {
+ err = -EINVAL;
+ goto err_pdd;
+ }
+
+ dev = pdd->dev;
if ((flags & KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC) &&
(flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) &&
!kfd_dev_is_large_bar(dev)) {
pr_err("Alloc host visible vram on small bar is not allowed\n");
- return -EINVAL;
+ err = -EINVAL;
+ goto err_large_bar;
}
- mutex_lock(&p->mutex);
-
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
err = PTR_ERR(pdd);
@@ -1308,12 +1096,16 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
goto err_unlock;
}
offset = kfd_get_process_doorbells(pdd);
+ if (!offset) {
+ err = -ENOMEM;
+ goto err_unlock;
+ }
} else if (flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
if (args->size != PAGE_SIZE) {
err = -EINVAL;
goto err_unlock;
}
- offset = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+ offset = dev->adev->rmmio_remap.bus_addr;
if (!offset) {
err = -ENOMEM;
goto err_unlock;
@@ -1321,9 +1113,9 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
}
err = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
- dev->kgd, args->va_addr, args->size,
+ dev->adev, args->va_addr, args->size,
pdd->drm_priv, (struct kgd_mem **) &mem, &offset,
- flags);
+ flags, false);
if (err)
goto err_unlock;
@@ -1353,9 +1145,11 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
return 0;
err_free:
- amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem,
+ amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->adev, (struct kgd_mem *)mem,
pdd->drm_priv, NULL);
err_unlock:
+err_pdd:
+err_large_bar:
mutex_unlock(&p->mutex);
return err;
}
@@ -1366,14 +1160,9 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
struct kfd_ioctl_free_memory_of_gpu_args *args = data;
struct kfd_process_device *pdd;
void *mem;
- struct kfd_dev *dev;
int ret;
uint64_t size = 0;
- dev = kfd_device_by_id(GET_GPU_ID(args->handle));
- if (!dev)
- return -EINVAL;
-
mutex_lock(&p->mutex);
/*
* Safeguard to prevent user space from freeing signal BO.
@@ -1385,11 +1174,11 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
goto err_unlock;
}
- pdd = kfd_get_process_device_data(dev, p);
+ pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle));
if (!pdd) {
pr_err("Process device data doesn't exist\n");
ret = -EINVAL;
- goto err_unlock;
+ goto err_pdd;
}
mem = kfd_process_device_translate_handle(
@@ -1399,7 +1188,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
goto err_unlock;
}
- ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd,
+ ret = amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev,
(struct kgd_mem *)mem, pdd->drm_priv, &size);
/* If freeing the buffer failed, leave the handle in place for
@@ -1412,6 +1201,7 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
err_unlock:
+err_pdd:
mutex_unlock(&p->mutex);
return ret;
}
@@ -1422,15 +1212,10 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
struct kfd_ioctl_map_memory_to_gpu_args *args = data;
struct kfd_process_device *pdd, *peer_pdd;
void *mem;
- struct kfd_dev *dev, *peer;
+ struct kfd_dev *dev;
long err = 0;
int i;
uint32_t *devices_arr = NULL;
- bool table_freed = false;
-
- dev = kfd_device_by_id(GET_GPU_ID(args->handle));
- if (!dev)
- return -EINVAL;
if (!args->n_devices) {
pr_debug("Device IDs array empty\n");
@@ -1455,6 +1240,12 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
}
mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle));
+ if (!pdd) {
+ err = -EINVAL;
+ goto get_process_device_data_failed;
+ }
+ dev = pdd->dev;
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -1470,25 +1261,33 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
}
for (i = args->n_success; i < args->n_devices; i++) {
- peer = kfd_device_by_id(devices_arr[i]);
- if (!peer) {
+ peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
+ if (!peer_pdd) {
pr_debug("Getting device by id failed for 0x%x\n",
devices_arr[i]);
err = -EINVAL;
goto get_mem_obj_from_handle_failed;
}
- peer_pdd = kfd_bind_process_to_device(peer, p);
+ peer_pdd = kfd_bind_process_to_device(peer_pdd->dev, p);
if (IS_ERR(peer_pdd)) {
err = PTR_ERR(peer_pdd);
goto get_mem_obj_from_handle_failed;
}
+
err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
- peer->kgd, (struct kgd_mem *)mem,
- peer_pdd->drm_priv, &table_freed);
+ peer_pdd->dev->adev, (struct kgd_mem *)mem,
+ peer_pdd->drm_priv);
if (err) {
- pr_err("Failed to map to gpu %d/%d\n",
- i, args->n_devices);
+ struct pci_dev *pdev = peer_pdd->dev->adev->pdev;
+
+ dev_err(dev->adev->dev,
+ "Failed to map peer:%04x:%02x:%02x.%d mem_domain:%d\n",
+ pci_domain_nr(pdev->bus),
+ pdev->bus->number,
+ PCI_SLOT(pdev->devfn),
+ PCI_FUNC(pdev->devfn),
+ ((struct kgd_mem *)mem)->domain);
goto map_memory_to_gpu_failed;
}
args->n_success = i+1;
@@ -1496,28 +1295,24 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
mutex_unlock(&p->mutex);
- err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd, (struct kgd_mem *) mem, true);
+ err = amdgpu_amdkfd_gpuvm_sync_memory(dev->adev, (struct kgd_mem *) mem, true);
if (err) {
pr_debug("Sync memory failed, wait interrupted by user signal\n");
goto sync_memory_failed;
}
/* Flush TLBs after waiting for the page table updates to complete */
- if (table_freed) {
- for (i = 0; i < args->n_devices; i++) {
- peer = kfd_device_by_id(devices_arr[i]);
- if (WARN_ON_ONCE(!peer))
- continue;
- peer_pdd = kfd_get_process_device_data(peer, p);
- if (WARN_ON_ONCE(!peer_pdd))
- continue;
- kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
- }
+ for (i = 0; i < args->n_devices; i++) {
+ peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
+ if (WARN_ON_ONCE(!peer_pdd))
+ continue;
+ kfd_flush_tlb(peer_pdd, TLB_FLUSH_LEGACY);
}
kfree(devices_arr);
return err;
+get_process_device_data_failed:
bind_process_to_device_failed:
get_mem_obj_from_handle_failed:
map_memory_to_gpu_failed:
@@ -1535,14 +1330,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
struct kfd_ioctl_unmap_memory_from_gpu_args *args = data;
struct kfd_process_device *pdd, *peer_pdd;
void *mem;
- struct kfd_dev *dev, *peer;
long err = 0;
uint32_t *devices_arr = NULL, i;
- dev = kfd_device_by_id(GET_GPU_ID(args->handle));
- if (!dev)
- return -EINVAL;
-
if (!args->n_devices) {
pr_debug("Device IDs array empty\n");
return -EINVAL;
@@ -1566,8 +1356,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
}
mutex_lock(&p->mutex);
-
- pdd = kfd_get_process_device_data(dev, p);
+ pdd = kfd_process_device_data_by_id(p, GET_GPU_ID(args->handle));
if (!pdd) {
err = -EINVAL;
goto bind_process_to_device_failed;
@@ -1581,19 +1370,13 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
}
for (i = args->n_success; i < args->n_devices; i++) {
- peer = kfd_device_by_id(devices_arr[i]);
- if (!peer) {
- err = -EINVAL;
- goto get_mem_obj_from_handle_failed;
- }
-
- peer_pdd = kfd_get_process_device_data(peer, p);
+ peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
if (!peer_pdd) {
- err = -ENODEV;
+ err = -EINVAL;
goto get_mem_obj_from_handle_failed;
}
err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
- peer->kgd, (struct kgd_mem *)mem, peer_pdd->drm_priv);
+ peer_pdd->dev->adev, (struct kgd_mem *)mem, peer_pdd->drm_priv);
if (err) {
pr_err("Failed to unmap from gpu %d/%d\n",
i, args->n_devices);
@@ -1603,8 +1386,8 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
}
mutex_unlock(&p->mutex);
- if (dev->device_info->asic_family == CHIP_ALDEBARAN) {
- err = amdgpu_amdkfd_gpuvm_sync_memory(dev->kgd,
+ if (kfd_flush_tlb_after_unmap(pdd->dev)) {
+ err = amdgpu_amdkfd_gpuvm_sync_memory(pdd->dev->adev,
(struct kgd_mem *) mem, true);
if (err) {
pr_debug("Sync memory failed, wait interrupted by user signal\n");
@@ -1613,10 +1396,7 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
/* Flush TLBs after waiting for the page table updates to complete */
for (i = 0; i < args->n_devices; i++) {
- peer = kfd_device_by_id(devices_arr[i]);
- if (WARN_ON_ONCE(!peer))
- continue;
- peer_pdd = kfd_get_process_device_data(peer, p);
+ peer_pdd = kfd_process_device_data_by_id(p, devices_arr[i]);
if (WARN_ON_ONCE(!peer_pdd))
continue;
kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
@@ -1680,7 +1460,7 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep,
{
struct kfd_ioctl_get_dmabuf_info_args *args = data;
struct kfd_dev *dev = NULL;
- struct kgd_dev *dma_buf_kgd;
+ struct amdgpu_device *dmabuf_adev;
void *metadata_buffer = NULL;
uint32_t flags;
unsigned int i;
@@ -1700,15 +1480,15 @@ static int kfd_ioctl_get_dmabuf_info(struct file *filep,
}
/* Get dmabuf info from KGD */
- r = amdgpu_amdkfd_get_dmabuf_info(dev->kgd, args->dmabuf_fd,
- &dma_buf_kgd, &args->size,
+ r = amdgpu_amdkfd_get_dmabuf_info(dev->adev, args->dmabuf_fd,
+ &dmabuf_adev, &args->size,
metadata_buffer, args->metadata_size,
&args->metadata_size, &flags);
if (r)
goto exit;
/* Reverse-lookup gpu_id from kgd pointer */
- dev = kfd_device_by_kgd(dma_buf_kgd);
+ dev = kfd_device_by_adev(dmabuf_adev);
if (!dev) {
r = -EINVAL;
goto exit;
@@ -1736,29 +1516,29 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
struct kfd_ioctl_import_dmabuf_args *args = data;
struct kfd_process_device *pdd;
struct dma_buf *dmabuf;
- struct kfd_dev *dev;
int idr_handle;
uint64_t size;
void *mem;
int r;
- dev = kfd_device_by_id(args->gpu_id);
- if (!dev)
- return -EINVAL;
-
dmabuf = dma_buf_get(args->dmabuf_fd);
if (IS_ERR(dmabuf))
return PTR_ERR(dmabuf);
mutex_lock(&p->mutex);
+ pdd = kfd_process_device_data_by_id(p, args->gpu_id);
+ if (!pdd) {
+ r = -EINVAL;
+ goto err_unlock;
+ }
- pdd = kfd_bind_process_to_device(dev, p);
+ pdd = kfd_bind_process_to_device(pdd->dev, p);
if (IS_ERR(pdd)) {
r = PTR_ERR(pdd);
goto err_unlock;
}
- r = amdgpu_amdkfd_gpuvm_import_dmabuf(dev->kgd, dmabuf,
+ r = amdgpu_amdkfd_gpuvm_import_dmabuf(pdd->dev->adev, dmabuf,
args->va_addr, pdd->drm_priv,
(struct kgd_mem **)&mem, &size,
NULL);
@@ -1779,7 +1559,7 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return 0;
err_free:
- amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem,
+ amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, (struct kgd_mem *)mem,
pdd->drm_priv, NULL);
err_unlock:
mutex_unlock(&p->mutex);
@@ -1792,15 +1572,20 @@ static int kfd_ioctl_smi_events(struct file *filep,
struct kfd_process *p, void *data)
{
struct kfd_ioctl_smi_events_args *args = data;
- struct kfd_dev *dev;
+ struct kfd_process_device *pdd;
- dev = kfd_device_by_id(args->gpuid);
- if (!dev)
+ mutex_lock(&p->mutex);
+
+ pdd = kfd_process_device_data_by_id(p, args->gpuid);
+ mutex_unlock(&p->mutex);
+ if (!pdd)
return -EINVAL;
- return kfd_smi_event_open(dev, &args->anon_fd);
+ return kfd_smi_event_open(pdd->dev, &args->anon_fd);
}
+#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
+
static int kfd_ioctl_set_xnack_mode(struct file *filep,
struct kfd_process *p, void *data)
{
@@ -1811,22 +1596,29 @@ static int kfd_ioctl_set_xnack_mode(struct file *filep,
if (args->xnack_enabled >= 0) {
if (!list_empty(&p->pqm.queues)) {
pr_debug("Process has user queues running\n");
- mutex_unlock(&p->mutex);
- return -EBUSY;
+ r = -EBUSY;
+ goto out_unlock;
}
- if (args->xnack_enabled && !kfd_process_xnack_mode(p, true))
+
+ if (p->xnack_enabled == args->xnack_enabled)
+ goto out_unlock;
+
+ if (args->xnack_enabled && !kfd_process_xnack_mode(p, true)) {
r = -EPERM;
- else
- p->xnack_enabled = args->xnack_enabled;
+ goto out_unlock;
+ }
+
+ r = svm_range_switch_xnack_reserve_mem(p, args->xnack_enabled);
} else {
args->xnack_enabled = p->xnack_enabled;
}
+
+out_unlock:
mutex_unlock(&p->mutex);
return r;
}
-#if IS_ENABLED(CONFIG_HSA_AMD_SVM)
static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
{
struct kfd_ioctl_svm_args *args = data;
@@ -1840,22 +1632,1014 @@ static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
if (!args->start_addr || !args->size)
return -EINVAL;
- mutex_lock(&p->mutex);
-
r = svm_ioctl(p, args->op, args->start_addr, args->size, args->nattr,
args->attrs);
- mutex_unlock(&p->mutex);
-
return r;
}
#else
+static int kfd_ioctl_set_xnack_mode(struct file *filep,
+ struct kfd_process *p, void *data)
+{
+ return -EPERM;
+}
static int kfd_ioctl_svm(struct file *filep, struct kfd_process *p, void *data)
{
return -EPERM;
}
#endif
+static int criu_checkpoint_process(struct kfd_process *p,
+ uint8_t __user *user_priv_data,
+ uint64_t *priv_offset)
+{
+ struct kfd_criu_process_priv_data process_priv;
+ int ret;
+
+ memset(&process_priv, 0, sizeof(process_priv));
+
+ process_priv.version = KFD_CRIU_PRIV_VERSION;
+ /* For CR, we don't consider negative xnack mode which is used for
+ * querying without changing it, here 0 simply means disabled and 1
+ * means enabled so retry for finding a valid PTE.
+ */
+ process_priv.xnack_mode = p->xnack_enabled ? 1 : 0;
+
+ ret = copy_to_user(user_priv_data + *priv_offset,
+ &process_priv, sizeof(process_priv));
+
+ if (ret) {
+ pr_err("Failed to copy process information to user\n");
+ ret = -EFAULT;
+ }
+
+ *priv_offset += sizeof(process_priv);
+ return ret;
+}
+
+static int criu_checkpoint_devices(struct kfd_process *p,
+ uint32_t num_devices,
+ uint8_t __user *user_addr,
+ uint8_t __user *user_priv_data,
+ uint64_t *priv_offset)
+{
+ struct kfd_criu_device_priv_data *device_priv = NULL;
+ struct kfd_criu_device_bucket *device_buckets = NULL;
+ int ret = 0, i;
+
+ device_buckets = kvzalloc(num_devices * sizeof(*device_buckets), GFP_KERNEL);
+ if (!device_buckets) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ device_priv = kvzalloc(num_devices * sizeof(*device_priv), GFP_KERNEL);
+ if (!device_priv) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ for (i = 0; i < num_devices; i++) {
+ struct kfd_process_device *pdd = p->pdds[i];
+
+ device_buckets[i].user_gpu_id = pdd->user_gpu_id;
+ device_buckets[i].actual_gpu_id = pdd->dev->id;
+
+ /*
+ * priv_data does not contain useful information for now and is reserved for
+ * future use, so we do not set its contents.
+ */
+ }
+
+ ret = copy_to_user(user_addr, device_buckets, num_devices * sizeof(*device_buckets));
+ if (ret) {
+ pr_err("Failed to copy device information to user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+
+ ret = copy_to_user(user_priv_data + *priv_offset,
+ device_priv,
+ num_devices * sizeof(*device_priv));
+ if (ret) {
+ pr_err("Failed to copy device information to user\n");
+ ret = -EFAULT;
+ }
+ *priv_offset += num_devices * sizeof(*device_priv);
+
+exit:
+ kvfree(device_buckets);
+ kvfree(device_priv);
+ return ret;
+}
+
+static uint32_t get_process_num_bos(struct kfd_process *p)
+{
+ uint32_t num_of_bos = 0;
+ int i;
+
+ /* Run over all PDDs of the process */
+ for (i = 0; i < p->n_pdds; i++) {
+ struct kfd_process_device *pdd = p->pdds[i];
+ void *mem;
+ int id;
+
+ idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+ struct kgd_mem *kgd_mem = (struct kgd_mem *)mem;
+
+ if ((uint64_t)kgd_mem->va > pdd->gpuvm_base)
+ num_of_bos++;
+ }
+ }
+ return num_of_bos;
+}
+
+static int criu_get_prime_handle(struct drm_gem_object *gobj, int flags,
+ u32 *shared_fd)
+{
+ struct dma_buf *dmabuf;
+ int ret;
+
+ dmabuf = amdgpu_gem_prime_export(gobj, flags);
+ if (IS_ERR(dmabuf)) {
+ ret = PTR_ERR(dmabuf);
+ pr_err("dmabuf export failed for the BO\n");
+ return ret;
+ }
+
+ ret = dma_buf_fd(dmabuf, flags);
+ if (ret < 0) {
+ pr_err("dmabuf create fd failed, ret:%d\n", ret);
+ goto out_free_dmabuf;
+ }
+
+ *shared_fd = ret;
+ return 0;
+
+out_free_dmabuf:
+ dma_buf_put(dmabuf);
+ return ret;
+}
+
+static int criu_checkpoint_bos(struct kfd_process *p,
+ uint32_t num_bos,
+ uint8_t __user *user_bos,
+ uint8_t __user *user_priv_data,
+ uint64_t *priv_offset)
+{
+ struct kfd_criu_bo_bucket *bo_buckets;
+ struct kfd_criu_bo_priv_data *bo_privs;
+ int ret = 0, pdd_index, bo_index = 0, id;
+ void *mem;
+
+ bo_buckets = kvzalloc(num_bos * sizeof(*bo_buckets), GFP_KERNEL);
+ if (!bo_buckets)
+ return -ENOMEM;
+
+ bo_privs = kvzalloc(num_bos * sizeof(*bo_privs), GFP_KERNEL);
+ if (!bo_privs) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ for (pdd_index = 0; pdd_index < p->n_pdds; pdd_index++) {
+ struct kfd_process_device *pdd = p->pdds[pdd_index];
+ struct amdgpu_bo *dumper_bo;
+ struct kgd_mem *kgd_mem;
+
+ idr_for_each_entry(&pdd->alloc_idr, mem, id) {
+ struct kfd_criu_bo_bucket *bo_bucket;
+ struct kfd_criu_bo_priv_data *bo_priv;
+ int i, dev_idx = 0;
+
+ if (!mem) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ kgd_mem = (struct kgd_mem *)mem;
+ dumper_bo = kgd_mem->bo;
+
+ if ((uint64_t)kgd_mem->va <= pdd->gpuvm_base)
+ continue;
+
+ bo_bucket = &bo_buckets[bo_index];
+ bo_priv = &bo_privs[bo_index];
+
+ bo_bucket->gpu_id = pdd->user_gpu_id;
+ bo_bucket->addr = (uint64_t)kgd_mem->va;
+ bo_bucket->size = amdgpu_bo_size(dumper_bo);
+ bo_bucket->alloc_flags = (uint32_t)kgd_mem->alloc_flags;
+ bo_priv->idr_handle = id;
+
+ if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+ ret = amdgpu_ttm_tt_get_userptr(&dumper_bo->tbo,
+ &bo_priv->user_addr);
+ if (ret) {
+ pr_err("Failed to obtain user address for user-pointer bo\n");
+ goto exit;
+ }
+ }
+ if (bo_bucket->alloc_flags
+ & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
+ ret = criu_get_prime_handle(&dumper_bo->tbo.base,
+ bo_bucket->alloc_flags &
+ KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE ? DRM_RDWR : 0,
+ &bo_bucket->dmabuf_fd);
+ if (ret)
+ goto exit;
+ } else {
+ bo_bucket->dmabuf_fd = KFD_INVALID_FD;
+ }
+
+ if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+ bo_bucket->offset = KFD_MMAP_TYPE_DOORBELL |
+ KFD_MMAP_GPU_ID(pdd->dev->id);
+ else if (bo_bucket->alloc_flags &
+ KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP)
+ bo_bucket->offset = KFD_MMAP_TYPE_MMIO |
+ KFD_MMAP_GPU_ID(pdd->dev->id);
+ else
+ bo_bucket->offset = amdgpu_bo_mmap_offset(dumper_bo);
+
+ for (i = 0; i < p->n_pdds; i++) {
+ if (amdgpu_amdkfd_bo_mapped_to_dev(p->pdds[i]->dev->adev, kgd_mem))
+ bo_priv->mapped_gpuids[dev_idx++] = p->pdds[i]->user_gpu_id;
+ }
+
+ pr_debug("bo_size = 0x%llx, bo_addr = 0x%llx bo_offset = 0x%llx\n"
+ "gpu_id = 0x%x alloc_flags = 0x%x idr_handle = 0x%x",
+ bo_bucket->size,
+ bo_bucket->addr,
+ bo_bucket->offset,
+ bo_bucket->gpu_id,
+ bo_bucket->alloc_flags,
+ bo_priv->idr_handle);
+ bo_index++;
+ }
+ }
+
+ ret = copy_to_user(user_bos, bo_buckets, num_bos * sizeof(*bo_buckets));
+ if (ret) {
+ pr_err("Failed to copy BO information to user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+
+ ret = copy_to_user(user_priv_data + *priv_offset, bo_privs, num_bos * sizeof(*bo_privs));
+ if (ret) {
+ pr_err("Failed to copy BO priv information to user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+
+ *priv_offset += num_bos * sizeof(*bo_privs);
+
+exit:
+ while (ret && bo_index--) {
+ if (bo_buckets[bo_index].alloc_flags
+ & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
+ close_fd(bo_buckets[bo_index].dmabuf_fd);
+ }
+
+ kvfree(bo_buckets);
+ kvfree(bo_privs);
+ return ret;
+}
+
+static int criu_get_process_object_info(struct kfd_process *p,
+ uint32_t *num_devices,
+ uint32_t *num_bos,
+ uint32_t *num_objects,
+ uint64_t *objs_priv_size)
+{
+ uint64_t queues_priv_data_size, svm_priv_data_size, priv_size;
+ uint32_t num_queues, num_events, num_svm_ranges;
+ int ret;
+
+ *num_devices = p->n_pdds;
+ *num_bos = get_process_num_bos(p);
+
+ ret = kfd_process_get_queue_info(p, &num_queues, &queues_priv_data_size);
+ if (ret)
+ return ret;
+
+ num_events = kfd_get_num_events(p);
+
+ ret = svm_range_get_info(p, &num_svm_ranges, &svm_priv_data_size);
+ if (ret)
+ return ret;
+
+ *num_objects = num_queues + num_events + num_svm_ranges;
+
+ if (objs_priv_size) {
+ priv_size = sizeof(struct kfd_criu_process_priv_data);
+ priv_size += *num_devices * sizeof(struct kfd_criu_device_priv_data);
+ priv_size += *num_bos * sizeof(struct kfd_criu_bo_priv_data);
+ priv_size += queues_priv_data_size;
+ priv_size += num_events * sizeof(struct kfd_criu_event_priv_data);
+ priv_size += svm_priv_data_size;
+ *objs_priv_size = priv_size;
+ }
+ return 0;
+}
+
+static int criu_checkpoint(struct file *filep,
+ struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args)
+{
+ int ret;
+ uint32_t num_devices, num_bos, num_objects;
+ uint64_t priv_size, priv_offset = 0, bo_priv_offset;
+
+ if (!args->devices || !args->bos || !args->priv_data)
+ return -EINVAL;
+
+ mutex_lock(&p->mutex);
+
+ if (!p->n_pdds) {
+ pr_err("No pdd for given process\n");
+ ret = -ENODEV;
+ goto exit_unlock;
+ }
+
+ /* Confirm all process queues are evicted */
+ if (!p->queues_paused) {
+ pr_err("Cannot dump process when queues are not in evicted state\n");
+ /* CRIU plugin did not call op PROCESS_INFO before checkpointing */
+ ret = -EINVAL;
+ goto exit_unlock;
+ }
+
+ ret = criu_get_process_object_info(p, &num_devices, &num_bos, &num_objects, &priv_size);
+ if (ret)
+ goto exit_unlock;
+
+ if (num_devices != args->num_devices ||
+ num_bos != args->num_bos ||
+ num_objects != args->num_objects ||
+ priv_size != args->priv_data_size) {
+
+ ret = -EINVAL;
+ goto exit_unlock;
+ }
+
+ /* each function will store private data inside priv_data and adjust priv_offset */
+ ret = criu_checkpoint_process(p, (uint8_t __user *)args->priv_data, &priv_offset);
+ if (ret)
+ goto exit_unlock;
+
+ ret = criu_checkpoint_devices(p, num_devices, (uint8_t __user *)args->devices,
+ (uint8_t __user *)args->priv_data, &priv_offset);
+ if (ret)
+ goto exit_unlock;
+
+ /* Leave room for BOs in the private data. They need to be restored
+ * before events, but we checkpoint them last to simplify the error
+ * handling.
+ */
+ bo_priv_offset = priv_offset;
+ priv_offset += num_bos * sizeof(struct kfd_criu_bo_priv_data);
+
+ if (num_objects) {
+ ret = kfd_criu_checkpoint_queues(p, (uint8_t __user *)args->priv_data,
+ &priv_offset);
+ if (ret)
+ goto exit_unlock;
+
+ ret = kfd_criu_checkpoint_events(p, (uint8_t __user *)args->priv_data,
+ &priv_offset);
+ if (ret)
+ goto exit_unlock;
+
+ ret = kfd_criu_checkpoint_svm(p, (uint8_t __user *)args->priv_data, &priv_offset);
+ if (ret)
+ goto exit_unlock;
+ }
+
+ /* This must be the last thing in this function that can fail.
+ * Otherwise we leak dmabuf file descriptors.
+ */
+ ret = criu_checkpoint_bos(p, num_bos, (uint8_t __user *)args->bos,
+ (uint8_t __user *)args->priv_data, &bo_priv_offset);
+
+exit_unlock:
+ mutex_unlock(&p->mutex);
+ if (ret)
+ pr_err("Failed to dump CRIU ret:%d\n", ret);
+ else
+ pr_debug("CRIU dump ret:%d\n", ret);
+
+ return ret;
+}
+
+static int criu_restore_process(struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args,
+ uint64_t *priv_offset,
+ uint64_t max_priv_data_size)
+{
+ int ret = 0;
+ struct kfd_criu_process_priv_data process_priv;
+
+ if (*priv_offset + sizeof(process_priv) > max_priv_data_size)
+ return -EINVAL;
+
+ ret = copy_from_user(&process_priv,
+ (void __user *)(args->priv_data + *priv_offset),
+ sizeof(process_priv));
+ if (ret) {
+ pr_err("Failed to copy process private information from user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+ *priv_offset += sizeof(process_priv);
+
+ if (process_priv.version != KFD_CRIU_PRIV_VERSION) {
+ pr_err("Invalid CRIU API version (checkpointed:%d current:%d)\n",
+ process_priv.version, KFD_CRIU_PRIV_VERSION);
+ return -EINVAL;
+ }
+
+ pr_debug("Setting XNACK mode\n");
+ if (process_priv.xnack_mode && !kfd_process_xnack_mode(p, true)) {
+ pr_err("xnack mode cannot be set\n");
+ ret = -EPERM;
+ goto exit;
+ } else {
+ pr_debug("set xnack mode: %d\n", process_priv.xnack_mode);
+ p->xnack_enabled = process_priv.xnack_mode;
+ }
+
+exit:
+ return ret;
+}
+
+static int criu_restore_devices(struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args,
+ uint64_t *priv_offset,
+ uint64_t max_priv_data_size)
+{
+ struct kfd_criu_device_bucket *device_buckets;
+ struct kfd_criu_device_priv_data *device_privs;
+ int ret = 0;
+ uint32_t i;
+
+ if (args->num_devices != p->n_pdds)
+ return -EINVAL;
+
+ if (*priv_offset + (args->num_devices * sizeof(*device_privs)) > max_priv_data_size)
+ return -EINVAL;
+
+ device_buckets = kmalloc_array(args->num_devices, sizeof(*device_buckets), GFP_KERNEL);
+ if (!device_buckets)
+ return -ENOMEM;
+
+ ret = copy_from_user(device_buckets, (void __user *)args->devices,
+ args->num_devices * sizeof(*device_buckets));
+ if (ret) {
+ pr_err("Failed to copy devices buckets from user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+
+ for (i = 0; i < args->num_devices; i++) {
+ struct kfd_dev *dev;
+ struct kfd_process_device *pdd;
+ struct file *drm_file;
+
+ /* device private data is not currently used */
+
+ if (!device_buckets[i].user_gpu_id) {
+ pr_err("Invalid user gpu_id\n");
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ dev = kfd_device_by_id(device_buckets[i].actual_gpu_id);
+ if (!dev) {
+ pr_err("Failed to find device with gpu_id = %x\n",
+ device_buckets[i].actual_gpu_id);
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ pdd = kfd_get_process_device_data(dev, p);
+ if (!pdd) {
+ pr_err("Failed to get pdd for gpu_id = %x\n",
+ device_buckets[i].actual_gpu_id);
+ ret = -EINVAL;
+ goto exit;
+ }
+ pdd->user_gpu_id = device_buckets[i].user_gpu_id;
+
+ drm_file = fget(device_buckets[i].drm_fd);
+ if (!drm_file) {
+ pr_err("Invalid render node file descriptor sent from plugin (%d)\n",
+ device_buckets[i].drm_fd);
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ if (pdd->drm_file) {
+ ret = -EINVAL;
+ goto exit;
+ }
+
+ /* create the vm using render nodes for kfd pdd */
+ if (kfd_process_device_init_vm(pdd, drm_file)) {
+ pr_err("could not init vm for given pdd\n");
+ /* On success, the PDD keeps the drm_file reference */
+ fput(drm_file);
+ ret = -EINVAL;
+ goto exit;
+ }
+ /*
+ * pdd now already has the vm bound to render node so below api won't create a new
+ * exclusive kfd mapping but use existing one with renderDXXX but is still needed
+ * for iommu v2 binding and runtime pm.
+ */
+ pdd = kfd_bind_process_to_device(dev, p);
+ if (IS_ERR(pdd)) {
+ ret = PTR_ERR(pdd);
+ goto exit;
+ }
+
+ if (!pdd->doorbell_index &&
+ kfd_alloc_process_doorbells(pdd->dev, &pdd->doorbell_index) < 0) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+ }
+
+ /*
+ * We are not copying device private data from user as we are not using the data for now,
+ * but we still adjust for its private data.
+ */
+ *priv_offset += args->num_devices * sizeof(*device_privs);
+
+exit:
+ kfree(device_buckets);
+ return ret;
+}
+
+static int criu_restore_memory_of_gpu(struct kfd_process_device *pdd,
+ struct kfd_criu_bo_bucket *bo_bucket,
+ struct kfd_criu_bo_priv_data *bo_priv,
+ struct kgd_mem **kgd_mem)
+{
+ int idr_handle;
+ int ret;
+ const bool criu_resume = true;
+ u64 offset;
+
+ if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL) {
+ if (bo_bucket->size != kfd_doorbell_process_slice(pdd->dev))
+ return -EINVAL;
+
+ offset = kfd_get_process_doorbells(pdd);
+ if (!offset)
+ return -ENOMEM;
+ } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+ /* MMIO BOs need remapped bus address */
+ if (bo_bucket->size != PAGE_SIZE) {
+ pr_err("Invalid page size\n");
+ return -EINVAL;
+ }
+ offset = pdd->dev->adev->rmmio_remap.bus_addr;
+ if (!offset) {
+ pr_err("amdgpu_amdkfd_get_mmio_remap_phys_addr failed\n");
+ return -ENOMEM;
+ }
+ } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_USERPTR) {
+ offset = bo_priv->user_addr;
+ }
+ /* Create the BO */
+ ret = amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(pdd->dev->adev, bo_bucket->addr,
+ bo_bucket->size, pdd->drm_priv, kgd_mem,
+ &offset, bo_bucket->alloc_flags, criu_resume);
+ if (ret) {
+ pr_err("Could not create the BO\n");
+ return ret;
+ }
+ pr_debug("New BO created: size:0x%llx addr:0x%llx offset:0x%llx\n",
+ bo_bucket->size, bo_bucket->addr, offset);
+
+ /* Restore previous IDR handle */
+ pr_debug("Restoring old IDR handle for the BO");
+ idr_handle = idr_alloc(&pdd->alloc_idr, *kgd_mem, bo_priv->idr_handle,
+ bo_priv->idr_handle + 1, GFP_KERNEL);
+
+ if (idr_handle < 0) {
+ pr_err("Could not allocate idr\n");
+ amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->adev, *kgd_mem, pdd->drm_priv,
+ NULL);
+ return -ENOMEM;
+ }
+
+ if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_DOORBELL)
+ bo_bucket->restored_offset = KFD_MMAP_TYPE_DOORBELL | KFD_MMAP_GPU_ID(pdd->dev->id);
+ if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_MMIO_REMAP) {
+ bo_bucket->restored_offset = KFD_MMAP_TYPE_MMIO | KFD_MMAP_GPU_ID(pdd->dev->id);
+ } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_GTT) {
+ bo_bucket->restored_offset = offset;
+ } else if (bo_bucket->alloc_flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM) {
+ bo_bucket->restored_offset = offset;
+ /* Update the VRAM usage count */
+ WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + bo_bucket->size);
+ }
+ return 0;
+}
+
+static int criu_restore_bo(struct kfd_process *p,
+ struct kfd_criu_bo_bucket *bo_bucket,
+ struct kfd_criu_bo_priv_data *bo_priv)
+{
+ struct kfd_process_device *pdd;
+ struct kgd_mem *kgd_mem;
+ int ret;
+ int j;
+
+ pr_debug("Restoring BO size:0x%llx addr:0x%llx gpu_id:0x%x flags:0x%x idr_handle:0x%x\n",
+ bo_bucket->size, bo_bucket->addr, bo_bucket->gpu_id, bo_bucket->alloc_flags,
+ bo_priv->idr_handle);
+
+ pdd = kfd_process_device_data_by_id(p, bo_bucket->gpu_id);
+ if (!pdd) {
+ pr_err("Failed to get pdd\n");
+ return -ENODEV;
+ }
+
+ ret = criu_restore_memory_of_gpu(pdd, bo_bucket, bo_priv, &kgd_mem);
+ if (ret)
+ return ret;
+
+ /* now map these BOs to GPU/s */
+ for (j = 0; j < p->n_pdds; j++) {
+ struct kfd_dev *peer;
+ struct kfd_process_device *peer_pdd;
+
+ if (!bo_priv->mapped_gpuids[j])
+ break;
+
+ peer_pdd = kfd_process_device_data_by_id(p, bo_priv->mapped_gpuids[j]);
+ if (!peer_pdd)
+ return -EINVAL;
+
+ peer = peer_pdd->dev;
+
+ peer_pdd = kfd_bind_process_to_device(peer, p);
+ if (IS_ERR(peer_pdd))
+ return PTR_ERR(peer_pdd);
+
+ ret = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(peer->adev, kgd_mem,
+ peer_pdd->drm_priv);
+ if (ret) {
+ pr_err("Failed to map to gpu %d/%d\n", j, p->n_pdds);
+ return ret;
+ }
+ }
+
+ pr_debug("map memory was successful for the BO\n");
+ /* create the dmabuf object and export the bo */
+ if (bo_bucket->alloc_flags
+ & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT)) {
+ ret = criu_get_prime_handle(&kgd_mem->bo->tbo.base, DRM_RDWR,
+ &bo_bucket->dmabuf_fd);
+ if (ret)
+ return ret;
+ } else {
+ bo_bucket->dmabuf_fd = KFD_INVALID_FD;
+ }
+
+ return 0;
+}
+
+static int criu_restore_bos(struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args,
+ uint64_t *priv_offset,
+ uint64_t max_priv_data_size)
+{
+ struct kfd_criu_bo_bucket *bo_buckets = NULL;
+ struct kfd_criu_bo_priv_data *bo_privs = NULL;
+ int ret = 0;
+ uint32_t i = 0;
+
+ if (*priv_offset + (args->num_bos * sizeof(*bo_privs)) > max_priv_data_size)
+ return -EINVAL;
+
+ /* Prevent MMU notifications until stage-4 IOCTL (CRIU_RESUME) is received */
+ amdgpu_amdkfd_block_mmu_notifications(p->kgd_process_info);
+
+ bo_buckets = kvmalloc_array(args->num_bos, sizeof(*bo_buckets), GFP_KERNEL);
+ if (!bo_buckets)
+ return -ENOMEM;
+
+ ret = copy_from_user(bo_buckets, (void __user *)args->bos,
+ args->num_bos * sizeof(*bo_buckets));
+ if (ret) {
+ pr_err("Failed to copy BOs information from user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+
+ bo_privs = kvmalloc_array(args->num_bos, sizeof(*bo_privs), GFP_KERNEL);
+ if (!bo_privs) {
+ ret = -ENOMEM;
+ goto exit;
+ }
+
+ ret = copy_from_user(bo_privs, (void __user *)args->priv_data + *priv_offset,
+ args->num_bos * sizeof(*bo_privs));
+ if (ret) {
+ pr_err("Failed to copy BOs information from user\n");
+ ret = -EFAULT;
+ goto exit;
+ }
+ *priv_offset += args->num_bos * sizeof(*bo_privs);
+
+ /* Create and map new BOs */
+ for (; i < args->num_bos; i++) {
+ ret = criu_restore_bo(p, &bo_buckets[i], &bo_privs[i]);
+ if (ret) {
+ pr_debug("Failed to restore BO[%d] ret%d\n", i, ret);
+ goto exit;
+ }
+ } /* done */
+
+ /* Copy only the buckets back so user can read bo_buckets[N].restored_offset */
+ ret = copy_to_user((void __user *)args->bos,
+ bo_buckets,
+ (args->num_bos * sizeof(*bo_buckets)));
+ if (ret)
+ ret = -EFAULT;
+
+exit:
+ while (ret && i--) {
+ if (bo_buckets[i].alloc_flags
+ & (KFD_IOC_ALLOC_MEM_FLAGS_VRAM | KFD_IOC_ALLOC_MEM_FLAGS_GTT))
+ close_fd(bo_buckets[i].dmabuf_fd);
+ }
+ kvfree(bo_buckets);
+ kvfree(bo_privs);
+ return ret;
+}
+
+static int criu_restore_objects(struct file *filep,
+ struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args,
+ uint64_t *priv_offset,
+ uint64_t max_priv_data_size)
+{
+ int ret = 0;
+ uint32_t i;
+
+ BUILD_BUG_ON(offsetof(struct kfd_criu_queue_priv_data, object_type));
+ BUILD_BUG_ON(offsetof(struct kfd_criu_event_priv_data, object_type));
+ BUILD_BUG_ON(offsetof(struct kfd_criu_svm_range_priv_data, object_type));
+
+ for (i = 0; i < args->num_objects; i++) {
+ uint32_t object_type;
+
+ if (*priv_offset + sizeof(object_type) > max_priv_data_size) {
+ pr_err("Invalid private data size\n");
+ return -EINVAL;
+ }
+
+ ret = get_user(object_type, (uint32_t __user *)(args->priv_data + *priv_offset));
+ if (ret) {
+ pr_err("Failed to copy private information from user\n");
+ goto exit;
+ }
+
+ switch (object_type) {
+ case KFD_CRIU_OBJECT_TYPE_QUEUE:
+ ret = kfd_criu_restore_queue(p, (uint8_t __user *)args->priv_data,
+ priv_offset, max_priv_data_size);
+ if (ret)
+ goto exit;
+ break;
+ case KFD_CRIU_OBJECT_TYPE_EVENT:
+ ret = kfd_criu_restore_event(filep, p, (uint8_t __user *)args->priv_data,
+ priv_offset, max_priv_data_size);
+ if (ret)
+ goto exit;
+ break;
+ case KFD_CRIU_OBJECT_TYPE_SVM_RANGE:
+ ret = kfd_criu_restore_svm(p, (uint8_t __user *)args->priv_data,
+ priv_offset, max_priv_data_size);
+ if (ret)
+ goto exit;
+ break;
+ default:
+ pr_err("Invalid object type:%u at index:%d\n", object_type, i);
+ ret = -EINVAL;
+ goto exit;
+ }
+ }
+exit:
+ return ret;
+}
+
+static int criu_restore(struct file *filep,
+ struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args)
+{
+ uint64_t priv_offset = 0;
+ int ret = 0;
+
+ pr_debug("CRIU restore (num_devices:%u num_bos:%u num_objects:%u priv_data_size:%llu)\n",
+ args->num_devices, args->num_bos, args->num_objects, args->priv_data_size);
+
+ if (!args->bos || !args->devices || !args->priv_data || !args->priv_data_size ||
+ !args->num_devices || !args->num_bos)
+ return -EINVAL;
+
+ mutex_lock(&p->mutex);
+
+ /*
+ * Set the process to evicted state to avoid running any new queues before all the memory
+ * mappings are ready.
+ */
+ ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_RESTORE);
+ if (ret)
+ goto exit_unlock;
+
+ /* Each function will adjust priv_offset based on how many bytes they consumed */
+ ret = criu_restore_process(p, args, &priv_offset, args->priv_data_size);
+ if (ret)
+ goto exit_unlock;
+
+ ret = criu_restore_devices(p, args, &priv_offset, args->priv_data_size);
+ if (ret)
+ goto exit_unlock;
+
+ ret = criu_restore_bos(p, args, &priv_offset, args->priv_data_size);
+ if (ret)
+ goto exit_unlock;
+
+ ret = criu_restore_objects(filep, p, args, &priv_offset, args->priv_data_size);
+ if (ret)
+ goto exit_unlock;
+
+ if (priv_offset != args->priv_data_size) {
+ pr_err("Invalid private data size\n");
+ ret = -EINVAL;
+ }
+
+exit_unlock:
+ mutex_unlock(&p->mutex);
+ if (ret)
+ pr_err("Failed to restore CRIU ret:%d\n", ret);
+ else
+ pr_debug("CRIU restore successful\n");
+
+ return ret;
+}
+
+static int criu_unpause(struct file *filep,
+ struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args)
+{
+ int ret;
+
+ mutex_lock(&p->mutex);
+
+ if (!p->queues_paused) {
+ mutex_unlock(&p->mutex);
+ return -EINVAL;
+ }
+
+ ret = kfd_process_restore_queues(p);
+ if (ret)
+ pr_err("Failed to unpause queues ret:%d\n", ret);
+ else
+ p->queues_paused = false;
+
+ mutex_unlock(&p->mutex);
+
+ return ret;
+}
+
+static int criu_resume(struct file *filep,
+ struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args)
+{
+ struct kfd_process *target = NULL;
+ struct pid *pid = NULL;
+ int ret = 0;
+
+ pr_debug("Inside %s, target pid for criu restore: %d\n", __func__,
+ args->pid);
+
+ pid = find_get_pid(args->pid);
+ if (!pid) {
+ pr_err("Cannot find pid info for %i\n", args->pid);
+ return -ESRCH;
+ }
+
+ pr_debug("calling kfd_lookup_process_by_pid\n");
+ target = kfd_lookup_process_by_pid(pid);
+
+ put_pid(pid);
+
+ if (!target) {
+ pr_debug("Cannot find process info for %i\n", args->pid);
+ return -ESRCH;
+ }
+
+ mutex_lock(&target->mutex);
+ ret = kfd_criu_resume_svm(target);
+ if (ret) {
+ pr_err("kfd_criu_resume_svm failed for %i\n", args->pid);
+ goto exit;
+ }
+
+ ret = amdgpu_amdkfd_criu_resume(target->kgd_process_info);
+ if (ret)
+ pr_err("amdgpu_amdkfd_criu_resume failed for %i\n", args->pid);
+
+exit:
+ mutex_unlock(&target->mutex);
+
+ kfd_unref_process(target);
+ return ret;
+}
+
+static int criu_process_info(struct file *filep,
+ struct kfd_process *p,
+ struct kfd_ioctl_criu_args *args)
+{
+ int ret = 0;
+
+ mutex_lock(&p->mutex);
+
+ if (!p->n_pdds) {
+ pr_err("No pdd for given process\n");
+ ret = -ENODEV;
+ goto err_unlock;
+ }
+
+ ret = kfd_process_evict_queues(p, KFD_QUEUE_EVICTION_CRIU_CHECKPOINT);
+ if (ret)
+ goto err_unlock;
+
+ p->queues_paused = true;
+
+ args->pid = task_pid_nr_ns(p->lead_thread,
+ task_active_pid_ns(p->lead_thread));
+
+ ret = criu_get_process_object_info(p, &args->num_devices, &args->num_bos,
+ &args->num_objects, &args->priv_data_size);
+ if (ret)
+ goto err_unlock;
+
+ dev_dbg(kfd_device, "Num of devices:%u bos:%u objects:%u priv_data_size:%lld\n",
+ args->num_devices, args->num_bos, args->num_objects,
+ args->priv_data_size);
+
+err_unlock:
+ if (ret) {
+ kfd_process_restore_queues(p);
+ p->queues_paused = false;
+ }
+ mutex_unlock(&p->mutex);
+ return ret;
+}
+
+static int kfd_ioctl_criu(struct file *filep, struct kfd_process *p, void *data)
+{
+ struct kfd_ioctl_criu_args *args = data;
+ int ret;
+
+ dev_dbg(kfd_device, "CRIU operation: %d\n", args->op);
+ switch (args->op) {
+ case KFD_CRIU_OP_PROCESS_INFO:
+ ret = criu_process_info(filep, p, args);
+ break;
+ case KFD_CRIU_OP_CHECKPOINT:
+ ret = criu_checkpoint(filep, p, args);
+ break;
+ case KFD_CRIU_OP_UNPAUSE:
+ ret = criu_unpause(filep, p, args);
+ break;
+ case KFD_CRIU_OP_RESTORE:
+ ret = criu_restore(filep, p, args);
+ break;
+ case KFD_CRIU_OP_RESUME:
+ ret = criu_resume(filep, p, args);
+ break;
+ default:
+ dev_dbg(kfd_device, "Unsupported CRIU operation:%d\n", args->op);
+ ret = -EINVAL;
+ break;
+ }
+
+ if (ret)
+ dev_dbg(kfd_device, "CRIU operation:%d err:%d\n", args->op, ret);
+
+ return ret;
+}
+
#define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \
[_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \
.cmd_drv = 0, .name = #ioctl}
@@ -1898,16 +2682,16 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_WAIT_EVENTS,
kfd_ioctl_wait_events, 0),
- AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER,
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_REGISTER_DEPRECATED,
kfd_ioctl_dbg_register, 0),
- AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER,
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_UNREGISTER_DEPRECATED,
kfd_ioctl_dbg_unregister, 0),
- AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH,
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_ADDRESS_WATCH_DEPRECATED,
kfd_ioctl_dbg_address_watch, 0),
- AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL,
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_DBG_WAVE_CONTROL_DEPRECATED,
kfd_ioctl_dbg_wave_control, 0),
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_SCRATCH_BACKING_VA,
@@ -1959,6 +2743,12 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
AMDKFD_IOCTL_DEF(AMDKFD_IOC_SET_XNACK_MODE,
kfd_ioctl_set_xnack_mode, 0),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_CRIU_OP,
+ kfd_ioctl_criu, KFD_IOC_FLAG_CHECKPOINT_RESTORE),
+
+ AMDKFD_IOCTL_DEF(AMDKFD_IOC_AVAILABLE_MEMORY,
+ kfd_ioctl_get_available_memory, 0),
};
#define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls)
@@ -1973,6 +2763,7 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
char *kdata = NULL;
unsigned int usize, asize;
int retcode = -EINVAL;
+ bool ptrace_attached = false;
if (nr >= AMDKFD_CORE_IOCTL_COUNT)
goto err_i1;
@@ -1998,7 +2789,15 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
* processes need to create their own KFD device context.
*/
process = filep->private_data;
- if (process->lead_thread != current->group_leader) {
+
+ rcu_read_lock();
+ if ((ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE) &&
+ ptrace_parent(process->lead_thread) == current)
+ ptrace_attached = true;
+ rcu_read_unlock();
+
+ if (process->lead_thread != current->group_leader
+ && !ptrace_attached) {
dev_dbg(kfd_device, "Using KFD FD in wrong process\n");
retcode = -EBADF;
goto err_i1;
@@ -2013,6 +2812,19 @@ static long kfd_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
goto err_i1;
}
+ /*
+ * Versions of docker shipped in Ubuntu 18.xx and 20.xx do not support
+ * CAP_CHECKPOINT_RESTORE, so we also allow access if CAP_SYS_ADMIN as CAP_SYS_ADMIN is a
+ * more priviledged access.
+ */
+ if (unlikely(ioctl->flags & KFD_IOC_FLAG_CHECKPOINT_RESTORE)) {
+ if (!capable(CAP_CHECKPOINT_RESTORE) &&
+ !capable(CAP_SYS_ADMIN)) {
+ retcode = -EACCES;
+ goto err_i1;
+ }
+ }
+
if (cmd & (IOC_IN | IOC_OUT)) {
if (asize <= sizeof(stack_kdata)) {
kdata = stack_kdata;
@@ -2061,12 +2873,11 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process,
struct vm_area_struct *vma)
{
phys_addr_t address;
- int ret;
if (vma->vm_end - vma->vm_start != PAGE_SIZE)
return -EINVAL;
- address = amdgpu_amdkfd_get_mmio_remap_phys_addr(dev->kgd);
+ address = dev->adev->rmmio_remap.bus_addr;
vma->vm_flags |= VM_IO | VM_DONTCOPY | VM_DONTEXPAND | VM_NORESERVE |
VM_DONTDUMP | VM_PFNMAP;
@@ -2081,12 +2892,11 @@ static int kfd_mmio_mmap(struct kfd_dev *dev, struct kfd_process *process,
process->pasid, (unsigned long long) vma->vm_start,
address, vma->vm_flags, PAGE_SIZE);
- ret = io_remap_pfn_range(vma,
+ return io_remap_pfn_range(vma,
vma->vm_start,
address >> PAGE_SHIFT,
PAGE_SIZE,
vma->vm_page_prot);
- return ret;
}