aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c')
-rw-r--r--drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c283
1 files changed, 177 insertions, 106 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
index 668ad07ebe1f..ec0d62a16e53 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
@@ -61,6 +61,8 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
unsigned int sdma_queue_id);
+static void kfd_process_hw_exception(struct work_struct *work);
+
static inline
enum KFD_MQD_TYPE get_mqd_type_from_queue_type(enum kfd_queue_type type)
{
@@ -99,6 +101,17 @@ unsigned int get_pipes_per_mec(struct device_queue_manager *dqm)
return dqm->dev->shared_resources.num_pipe_per_mec;
}
+static unsigned int get_num_sdma_engines(struct device_queue_manager *dqm)
+{
+ return dqm->dev->device_info->num_sdma_engines;
+}
+
+unsigned int get_num_sdma_queues(struct device_queue_manager *dqm)
+{
+ return dqm->dev->device_info->num_sdma_engines
+ * KFD_SDMA_QUEUES_PER_ENGINE;
+}
+
void program_sh_mem_settings(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
@@ -240,7 +253,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
print_queue(q);
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
@@ -297,7 +310,7 @@ static int create_queue_nocpsch(struct device_queue_manager *dqm,
dqm->total_queue_count);
out_unlock:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -346,10 +359,10 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
int retval;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
- mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
- if (!mqd)
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_COMPUTE);
+ if (!mqd_mgr)
return -ENOMEM;
retval = allocate_hqd(dqm, q);
@@ -360,7 +373,7 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
if (retval)
goto out_deallocate_hqd;
- retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval)
goto out_deallocate_doorbell;
@@ -374,15 +387,15 @@ static int create_compute_queue_nocpsch(struct device_queue_manager *dqm,
if (!q->properties.is_active)
return 0;
- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue, &q->properties,
- q->process->mm);
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
+ &q->properties, q->process->mm);
if (retval)
goto out_uninit_mqd;
return 0;
out_uninit_mqd:
- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
out_deallocate_hqd:
@@ -399,11 +412,11 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
struct queue *q)
{
int retval;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd)
+ if (!mqd_mgr)
return -ENOMEM;
if (q->properties.type == KFD_QUEUE_TYPE_COMPUTE) {
@@ -420,14 +433,14 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm,
deallocate_doorbell(qpd, q);
- retval = mqd->destroy_mqd(mqd, q->mqd,
+ retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_RESET,
KFD_UNMAP_LATENCY_MS,
q->pipe, q->queue);
if (retval == -ETIME)
qpd->reset_wavefronts = true;
- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
list_del(&q->list);
if (list_empty(&qpd->queues_list)) {
@@ -457,9 +470,9 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
{
int retval;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
retval = destroy_queue_nocpsch_locked(dqm, qpd, q);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -467,19 +480,19 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm,
static int update_queue(struct device_queue_manager *dqm, struct queue *q)
{
int retval;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
bool prev_active = false;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
pdd = kfd_get_process_device_data(q->device, q->process);
if (!pdd) {
retval = -ENODEV;
goto out_unlock;
}
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd) {
+ if (!mqd_mgr) {
retval = -ENOMEM;
goto out_unlock;
}
@@ -506,7 +519,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
} else if (prev_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA)) {
- retval = mqd->destroy_mqd(mqd, q->mqd,
+ retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
if (retval) {
@@ -515,7 +528,7 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
}
}
- retval = mqd->update_mqd(mqd, q->mqd, &q->properties);
+ retval = mqd_mgr->update_mqd(mqd_mgr, q->mqd, &q->properties);
/*
* check active state vs. the previous state and modify
@@ -533,44 +546,44 @@ static int update_queue(struct device_queue_manager *dqm, struct queue *q)
else if (q->properties.is_active &&
(q->properties.type == KFD_QUEUE_TYPE_COMPUTE ||
q->properties.type == KFD_QUEUE_TYPE_SDMA))
- retval = mqd->load_mqd(mqd, q->mqd, q->pipe, q->queue,
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe, q->queue,
&q->properties, q->process->mm);
out_unlock:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
static struct mqd_manager *get_mqd_manager(
struct device_queue_manager *dqm, enum KFD_MQD_TYPE type)
{
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
if (WARN_ON(type >= KFD_MQD_TYPE_MAX))
return NULL;
pr_debug("mqd type %d\n", type);
- mqd = dqm->mqds[type];
- if (!mqd) {
- mqd = mqd_manager_init(type, dqm->dev);
- if (!mqd)
+ mqd_mgr = dqm->mqd_mgrs[type];
+ if (!mqd_mgr) {
+ mqd_mgr = mqd_manager_init(type, dqm->dev);
+ if (!mqd_mgr)
pr_err("mqd manager is NULL");
- dqm->mqds[type] = mqd;
+ dqm->mqd_mgrs[type] = mqd_mgr;
}
- return mqd;
+ return mqd_mgr;
}
static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
int retval = 0;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
goto out;
@@ -582,16 +595,16 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_active)
continue;
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd) { /* should not be here */
+ if (!mqd_mgr) { /* should not be here */
pr_err("Cannot evict queue, mqd mgr is NULL\n");
retval = -ENOMEM;
goto out;
}
q->properties.is_evicted = true;
q->properties.is_active = false;
- retval = mqd->destroy_mqd(mqd, q->mqd,
+ retval = mqd_mgr->destroy_mqd(mqd_mgr, q->mqd,
KFD_PREEMPT_TYPE_WAVEFRONT_DRAIN,
KFD_UNMAP_LATENCY_MS, q->pipe, q->queue);
if (retval)
@@ -600,7 +613,7 @@ static int evict_process_queues_nocpsch(struct device_queue_manager *dqm,
}
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -611,7 +624,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
struct kfd_process_device *pdd;
int retval = 0;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (qpd->evicted++ > 0) /* already evicted, do nothing */
goto out;
@@ -633,7 +646,7 @@ static int evict_process_queues_cpsch(struct device_queue_manager *dqm,
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -641,7 +654,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
struct qcm_process_device *qpd)
{
struct queue *q;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
struct kfd_process_device *pdd;
uint32_t pd_base;
int retval = 0;
@@ -650,7 +663,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
/* Retrieve PD base */
pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
goto out;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
@@ -677,16 +690,16 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
list_for_each_entry(q, &qpd->queues_list, list) {
if (!q->properties.is_evicted)
continue;
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd) { /* should not be here */
+ if (!mqd_mgr) { /* should not be here */
pr_err("Cannot restore queue, mqd mgr is NULL\n");
retval = -ENOMEM;
goto out;
}
q->properties.is_evicted = false;
q->properties.is_active = true;
- retval = mqd->load_mqd(mqd, q->mqd, q->pipe,
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, q->pipe,
q->queue, &q->properties,
q->process->mm);
if (retval)
@@ -695,7 +708,7 @@ static int restore_process_queues_nocpsch(struct device_queue_manager *dqm,
}
qpd->evicted = 0;
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -711,7 +724,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
/* Retrieve PD base */
pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
goto out;
if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
@@ -739,7 +752,7 @@ static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
if (!retval)
qpd->evicted = 0;
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -761,7 +774,7 @@ static int register_process(struct device_queue_manager *dqm,
/* Retrieve PD base */
pd_base = dqm->dev->kfd2kgd->get_process_page_dir(pdd->vm);
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
list_add(&n->list, &dqm->queues);
/* Update PD Base in QPD */
@@ -769,9 +782,10 @@ static int register_process(struct device_queue_manager *dqm,
retval = dqm->asic_ops.update_qpd(dqm, qpd);
- dqm->processes_count++;
+ if (dqm->processes_count++ == 0)
+ dqm->dev->kfd2kgd->set_compute_idle(dqm->dev->kgd, false);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -786,20 +800,22 @@ static int unregister_process(struct device_queue_manager *dqm,
list_empty(&qpd->queues_list) ? "empty" : "not empty");
retval = 0;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
list_for_each_entry_safe(cur, next, &dqm->queues, list) {
if (qpd == cur->qpd) {
list_del(&cur->list);
kfree(cur);
- dqm->processes_count--;
+ if (--dqm->processes_count == 0)
+ dqm->dev->kfd2kgd->set_compute_idle(
+ dqm->dev->kgd, true);
goto out;
}
}
/* qpd not found in dqm list */
retval = 1;
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -838,7 +854,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
if (!dqm->allocated_queues)
return -ENOMEM;
- mutex_init(&dqm->lock);
+ mutex_init(&dqm->lock_hidden);
INIT_LIST_HEAD(&dqm->queues);
dqm->queue_count = dqm->next_pipe_to_allocate = 0;
dqm->sdma_queue_count = 0;
@@ -853,7 +869,7 @@ static int initialize_nocpsch(struct device_queue_manager *dqm)
}
dqm->vmid_bitmap = (1 << dqm->dev->vm_info.vmid_num_kfd) - 1;
- dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
+ dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
return 0;
}
@@ -866,8 +882,8 @@ static void uninitialize(struct device_queue_manager *dqm)
kfree(dqm->allocated_queues);
for (i = 0 ; i < KFD_MQD_TYPE_MAX ; i++)
- kfree(dqm->mqds[i]);
- mutex_destroy(&dqm->lock);
+ kfree(dqm->mqd_mgrs[i]);
+ mutex_destroy(&dqm->lock_hidden);
kfd_gtt_sa_free(dqm->dev, dqm->pipeline_mem);
}
@@ -901,7 +917,7 @@ static int allocate_sdma_queue(struct device_queue_manager *dqm,
static void deallocate_sdma_queue(struct device_queue_manager *dqm,
unsigned int sdma_queue_id)
{
- if (sdma_queue_id >= CIK_SDMA_QUEUES)
+ if (sdma_queue_id >= get_num_sdma_queues(dqm))
return;
dqm->sdma_bitmap |= (1 << sdma_queue_id);
}
@@ -910,19 +926,19 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
struct queue *q,
struct qcm_process_device *qpd)
{
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
int retval;
- mqd = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
- if (!mqd)
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm, KFD_MQD_TYPE_SDMA);
+ if (!mqd_mgr)
return -ENOMEM;
retval = allocate_sdma_queue(dqm, &q->sdma_id);
if (retval)
return retval;
- q->properties.sdma_queue_id = q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
- q->properties.sdma_engine_id = q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
+ q->properties.sdma_queue_id = q->sdma_id / get_num_sdma_engines(dqm);
+ q->properties.sdma_engine_id = q->sdma_id % get_num_sdma_engines(dqm);
retval = allocate_doorbell(qpd, q);
if (retval)
@@ -933,19 +949,20 @@ static int create_sdma_queue_nocpsch(struct device_queue_manager *dqm,
pr_debug("SDMA engine id: %d\n", q->properties.sdma_engine_id);
dqm->asic_ops.init_sdma_vm(dqm, q, qpd);
- retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval)
goto out_deallocate_doorbell;
- retval = mqd->load_mqd(mqd, q->mqd, 0, 0, &q->properties, NULL);
+ retval = mqd_mgr->load_mqd(mqd_mgr, q->mqd, 0, 0, &q->properties,
+ NULL);
if (retval)
goto out_uninit_mqd;
return 0;
out_uninit_mqd:
- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
out_deallocate_doorbell:
deallocate_doorbell(qpd, q);
out_deallocate_sdma_queue:
@@ -1003,12 +1020,14 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
{
pr_debug("num of pipes: %d\n", get_pipes_per_mec(dqm));
- mutex_init(&dqm->lock);
+ mutex_init(&dqm->lock_hidden);
INIT_LIST_HEAD(&dqm->queues);
dqm->queue_count = dqm->processes_count = 0;
dqm->sdma_queue_count = 0;
dqm->active_runlist = false;
- dqm->sdma_bitmap = (1 << CIK_SDMA_QUEUES) - 1;
+ dqm->sdma_bitmap = (1 << get_num_sdma_queues(dqm)) - 1;
+
+ INIT_WORK(&dqm->hw_exception_work, kfd_process_hw_exception);
return 0;
}
@@ -1041,9 +1060,11 @@ static int start_cpsch(struct device_queue_manager *dqm)
init_interrupts(dqm);
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
+ /* clear hang status when driver try to start the hw scheduler */
+ dqm->is_hws_hang = false;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return 0;
fail_allocate_vidmem:
@@ -1055,9 +1076,9 @@ fail_packet_manager_init:
static int stop_cpsch(struct device_queue_manager *dqm)
{
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
unmap_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
kfd_gtt_sa_free(dqm->dev, dqm->fence_mem);
pm_uninit(&dqm->packets);
@@ -1069,11 +1090,11 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd)
{
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new kernel queue because %d queues were already created\n",
dqm->total_queue_count);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return -EPERM;
}
@@ -1089,7 +1110,7 @@ static int create_kernel_queue_cpsch(struct device_queue_manager *dqm,
dqm->queue_count++;
qpd->is_debug = true;
execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return 0;
}
@@ -1098,7 +1119,7 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd)
{
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
list_del(&kq->list);
dqm->queue_count--;
qpd->is_debug = false;
@@ -1110,18 +1131,18 @@ static void destroy_kernel_queue_cpsch(struct device_queue_manager *dqm,
dqm->total_queue_count--;
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
}
static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
struct qcm_process_device *qpd)
{
int retval;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
retval = 0;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (dqm->total_queue_count >= max_num_of_queues_per_device) {
pr_warn("Can't create new usermode queue because %d queues were already created\n",
@@ -1135,19 +1156,19 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
if (retval)
goto out_unlock;
q->properties.sdma_queue_id =
- q->sdma_id / CIK_SDMA_QUEUES_PER_ENGINE;
+ q->sdma_id / get_num_sdma_engines(dqm);
q->properties.sdma_engine_id =
- q->sdma_id % CIK_SDMA_QUEUES_PER_ENGINE;
+ q->sdma_id % get_num_sdma_engines(dqm);
}
retval = allocate_doorbell(qpd, q);
if (retval)
goto out_deallocate_sdma_queue;
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd) {
+ if (!mqd_mgr) {
retval = -ENOMEM;
goto out_deallocate_doorbell;
}
@@ -1164,7 +1185,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
q->properties.tba_addr = qpd->tba_addr;
q->properties.tma_addr = qpd->tma_addr;
- retval = mqd->init_mqd(mqd, &q->mqd, &q->mqd_mem_obj,
+ retval = mqd_mgr->init_mqd(mqd_mgr, &q->mqd, &q->mqd_mem_obj,
&q->gart_mqd_addr, &q->properties);
if (retval)
goto out_deallocate_doorbell;
@@ -1188,7 +1209,7 @@ static int create_queue_cpsch(struct device_queue_manager *dqm, struct queue *q,
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
out_deallocate_doorbell:
@@ -1197,7 +1218,8 @@ out_deallocate_sdma_queue:
if (q->properties.type == KFD_QUEUE_TYPE_SDMA)
deallocate_sdma_queue(dqm, q->sdma_id);
out_unlock:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
+
return retval;
}
@@ -1210,6 +1232,13 @@ int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
while (*fence_addr != fence_value) {
if (time_after(jiffies, end_jiffies)) {
pr_err("qcm fence wait loop timeout expired\n");
+ /* In HWS case, this is used to halt the driver thread
+ * in order not to mess up CP states before doing
+ * scandumps for FW debugging.
+ */
+ while (halt_if_hws_hang)
+ schedule();
+
return -ETIME;
}
schedule();
@@ -1254,6 +1283,8 @@ static int unmap_queues_cpsch(struct device_queue_manager *dqm,
{
int retval = 0;
+ if (dqm->is_hws_hang)
+ return -EIO;
if (!dqm->active_runlist)
return retval;
@@ -1292,9 +1323,13 @@ static int execute_queues_cpsch(struct device_queue_manager *dqm,
{
int retval;
+ if (dqm->is_hws_hang)
+ return -EIO;
retval = unmap_queues_cpsch(dqm, filter, filter_param);
if (retval) {
pr_err("The cp might be in an unrecoverable state due to an unsuccessful queues preemption\n");
+ dqm->is_hws_hang = true;
+ schedule_work(&dqm->hw_exception_work);
return retval;
}
@@ -1306,7 +1341,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
struct queue *q)
{
int retval;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
bool preempt_all_queues;
preempt_all_queues = false;
@@ -1314,7 +1349,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
retval = 0;
/* remove queue from list to prevent rescheduling after preemption */
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (qpd->is_debug) {
/*
@@ -1326,9 +1361,9 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
}
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd) {
+ if (!mqd_mgr) {
retval = -ENOMEM;
goto failed;
}
@@ -1350,7 +1385,7 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
qpd->reset_wavefronts = true;
}
- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
/*
* Unconditionally decrement this counter, regardless of the queue's
@@ -1360,14 +1395,14 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm,
pr_debug("Total of %d queues are accountable so far\n",
dqm->total_queue_count);
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
failed:
failed_try_destroy_debugged_queue:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -1391,7 +1426,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
if (!dqm->asic_ops.set_cache_memory_policy)
return retval;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
if (alternate_aperture_size == 0) {
/* base > limit disables APE1 */
@@ -1437,7 +1472,7 @@ static bool set_cache_memory_policy(struct device_queue_manager *dqm,
qpd->sh_mem_ape1_limit);
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -1468,7 +1503,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
struct device_process_node *cur, *next_dpn;
int retval = 0;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
/* Clear all user mode queues */
list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
@@ -1489,7 +1524,7 @@ static int process_termination_nocpsch(struct device_queue_manager *dqm,
}
}
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -1500,14 +1535,14 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
int retval;
struct queue *q, *next;
struct kernel_queue *kq, *kq_next;
- struct mqd_manager *mqd;
+ struct mqd_manager *mqd_mgr;
struct device_process_node *cur, *next_dpn;
enum kfd_unmap_queues_filter filter =
KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES;
retval = 0;
- mutex_lock(&dqm->lock);
+ dqm_lock(dqm);
/* Clean all kernel queues */
list_for_each_entry_safe(kq, kq_next, &qpd->priv_queue_list, list) {
@@ -1542,7 +1577,7 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
}
retval = execute_queues_cpsch(dqm, filter, 0);
- if (retval || qpd->reset_wavefronts) {
+ if ((!dqm->is_hws_hang) && (retval || qpd->reset_wavefronts)) {
pr_warn("Resetting wave fronts (cpsch) on dev %p\n", dqm->dev);
dbgdev_wave_reset_wavefronts(dqm->dev, qpd->pqm->process);
qpd->reset_wavefronts = false;
@@ -1550,19 +1585,19 @@ static int process_termination_cpsch(struct device_queue_manager *dqm,
/* lastly, free mqd resources */
list_for_each_entry_safe(q, next, &qpd->queues_list, list) {
- mqd = dqm->ops.get_mqd_manager(dqm,
+ mqd_mgr = dqm->ops.get_mqd_manager(dqm,
get_mqd_type_from_queue_type(q->properties.type));
- if (!mqd) {
+ if (!mqd_mgr) {
retval = -ENOMEM;
goto out;
}
list_del(&q->list);
qpd->queue_count--;
- mqd->uninit_mqd(mqd, q->mqd, q->mqd_mem_obj);
+ mqd_mgr->uninit_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj);
}
out:
- mutex_unlock(&dqm->lock);
+ dqm_unlock(dqm);
return retval;
}
@@ -1683,6 +1718,30 @@ void device_queue_manager_uninit(struct device_queue_manager *dqm)
kfree(dqm);
}
+int kfd_process_vm_fault(struct device_queue_manager *dqm,
+ unsigned int pasid)
+{
+ struct kfd_process_device *pdd;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+ int ret = 0;
+
+ if (!p)
+ return -EINVAL;
+ pdd = kfd_get_process_device_data(dqm->dev, p);
+ if (pdd)
+ ret = dqm->ops.evict_process_queues(dqm, &pdd->qpd);
+ kfd_unref_process(p);
+
+ return ret;
+}
+
+static void kfd_process_hw_exception(struct work_struct *work)
+{
+ struct device_queue_manager *dqm = container_of(work,
+ struct device_queue_manager, hw_exception_work);
+ dqm->dev->kfd2kgd->gpu_recover(dqm->dev->kgd);
+}
+
#if defined(CONFIG_DEBUG_FS)
static void seq_reg_dump(struct seq_file *m,
@@ -1746,8 +1805,8 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
}
}
- for (pipe = 0; pipe < CIK_SDMA_ENGINE_NUM; pipe++) {
- for (queue = 0; queue < CIK_SDMA_QUEUES_PER_ENGINE; queue++) {
+ for (pipe = 0; pipe < get_num_sdma_engines(dqm); pipe++) {
+ for (queue = 0; queue < KFD_SDMA_QUEUES_PER_ENGINE; queue++) {
r = dqm->dev->kfd2kgd->hqd_sdma_dump(
dqm->dev->kgd, pipe, queue, &dump, &n_regs);
if (r)
@@ -1764,4 +1823,16 @@ int dqm_debugfs_hqds(struct seq_file *m, void *data)
return r;
}
+int dqm_debugfs_execute_queues(struct device_queue_manager *dqm)
+{
+ int r = 0;
+
+ dqm_lock(dqm);
+ dqm->active_runlist = true;
+ r = execute_queues_cpsch(dqm, KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0);
+ dqm_unlock(dqm);
+
+ return r;
+}
+
#endif