Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c')
-rw-r--r--  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 104
1 file changed, 97 insertions(+), 7 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
index 543e7ea75593..0b75a37b689b 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
- * Copyright 2016-2018 Advanced Micro Devices, Inc.
+ * Copyright 2016-2022 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -89,6 +90,76 @@ enum SQ_INTERRUPT_ERROR_TYPE {
#define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000
#define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20
+static void event_interrupt_poison_consumption_v9(struct kfd_dev *dev,
+ uint16_t pasid, uint16_t client_id)
+{
+ int old_poison, ret = -EINVAL;
+ struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
+
+ if (!p)
+ return;
+
+ /* all queues of a process will be unmapped at once */
+ old_poison = atomic_cmpxchg(&p->poison, 0, 1);
+ kfd_unref_process(p);
+ if (old_poison)
+ return;
+
+ switch (client_id) {
+ case SOC15_IH_CLIENTID_SE0SH:
+ case SOC15_IH_CLIENTID_SE1SH:
+ case SOC15_IH_CLIENTID_SE2SH:
+ case SOC15_IH_CLIENTID_SE3SH:
+ case SOC15_IH_CLIENTID_UTCL2:
+ ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
+ break;
+ case SOC15_IH_CLIENTID_SDMA0:
+ case SOC15_IH_CLIENTID_SDMA1:
+ case SOC15_IH_CLIENTID_SDMA2:
+ case SOC15_IH_CLIENTID_SDMA3:
+ case SOC15_IH_CLIENTID_SDMA4:
+ break;
+ default:
+ break;
+ }
+
+ kfd_signal_poison_consumed_event(dev, pasid);
+
+ /* If resetting the queue succeeds, do page retirement without a GPU
+ * reset; if it fails, fall back to a full GPU reset.
+ */
+ if (!ret) {
+ dev_warn(dev->adev->dev,
+ "RAS poison consumption, unmap queue flow succeeded: client id %d\n",
+ client_id);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false);
+ } else {
+ dev_warn(dev->adev->dev,
+ "RAS poison consumption, fall back to gpu reset flow: client id %d\n",
+ client_id);
+ amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true);
+ }
+}
+
+static bool context_id_expected(struct kfd_dev *dev)
+{
+ switch (KFD_GC_VERSION(dev)) {
+ case IP_VERSION(9, 0, 1):
+ return dev->mec_fw_version >= 0x817a;
+ case IP_VERSION(9, 1, 0):
+ case IP_VERSION(9, 2, 1):
+ case IP_VERSION(9, 2, 2):
+ case IP_VERSION(9, 3, 0):
+ case IP_VERSION(9, 4, 0):
+ return dev->mec_fw_version >= 0x17a;
+ default:
+ /* Other GFXv9 and later GPUs always send valid context IDs
+ * on legitimate events
+ */
+ return KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 1);
+ }
+}
+
static bool event_interrupt_isr_v9(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
uint32_t *patched_ihre,
@@ -135,7 +206,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
*patched_flag = true;
memcpy(patched_ihre, ih_ring_entry,
- dev->device_info->ih_ring_entry_size);
+ dev->device_info.ih_ring_entry_size);
pasid = dev->dqm->vmid_pasid[vmid];
@@ -154,11 +225,26 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev,
if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt"))
return false;
+ /* Workaround CP firmware sending bogus signals with 0 context_id.
+ * Those can be safely ignored on hardware and firmware versions that
+ * include a valid context_id on legitimate signals. This avoids the
+ * slow path in kfd_signal_event_interrupt that scans all event slots
+ * for signaled events.
+ */
+ if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) {
+ uint32_t context_id =
+ SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry);
+
+ if (context_id == 0 && context_id_expected(dev))
+ return false;
+ }
+
/* Interrupt types we care about: various signals and faults.
* They will be forwarded to a work queue (see below).
*/
return source_id == SOC15_INTSRC_CP_END_OF_PIPE ||
source_id == SOC15_INTSRC_SDMA_TRAP ||
+ source_id == SOC15_INTSRC_SDMA_ECC ||
source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG ||
source_id == SOC15_INTSRC_CP_BAD_OPCODE ||
((client_id == SOC15_IH_CLIENTID_VMC ||
@@ -230,8 +316,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
sq_intr_err);
if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST &&
sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) {
- kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
+ event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
}
break;
@@ -252,8 +337,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
if (source_id == SOC15_INTSRC_SDMA_TRAP) {
kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28);
} else if (source_id == SOC15_INTSRC_SDMA_ECC) {
- kfd_signal_poison_consumed_event(dev, pasid);
- amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd);
+ event_interrupt_poison_consumption_v9(dev, pasid, client_id);
return;
}
} else if (client_id == SOC15_IH_CLIENTID_VMC ||
@@ -262,6 +346,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
struct kfd_vm_fault_info info = {0};
uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry);
+ if (client_id == SOC15_IH_CLIENTID_UTCL2 &&
+ amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) {
+ event_interrupt_poison_consumption_v9(dev, pasid, client_id);
+ return;
+ }
+
info.vmid = vmid;
info.mc_id = client_id;
info.page_addr = ih_ring_entry[4] |
@@ -271,7 +361,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev,
info.prot_write = ring_id & 0x20;
kfd_smi_event_update_vmfault(dev, pasid);
- kfd_process_vm_fault(dev->dqm, pasid);
+ kfd_dqm_evict_pasid(dev->dqm, pasid);
kfd_signal_vm_fault_event(dev, pasid, &info);
}
}
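
One detail worth calling out in the new event_interrupt_poison_consumption_v9 helper: atomic_cmpxchg(&p->poison, 0, 1) makes poison handling one-shot per process, so only the interrupt that wins the 0 -> 1 transition walks the unmap/reset path, while later interrupts for the same process return early right after kfd_unref_process. Below is a minimal, self-contained userspace C11 sketch of that compare-and-exchange gating pattern; handle_poison_once and the bare atomic_int are illustrative stand-ins, not part of the patch.

	/* One-shot gating with compare-and-exchange: only the caller that
	 * flips the flag from 0 to 1 does the expensive work; everyone else
	 * sees the flag already set and returns early.
	 */
	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_int poison;	/* stands in for p->poison */

	static void handle_poison_once(int client_id)
	{
		int expected = 0;

		/* True only for the caller that wins the 0 -> 1 transition. */
		if (!atomic_compare_exchange_strong(&poison, &expected, 1)) {
			printf("client %d: poison already being handled, skip\n",
			       client_id);
			return;
		}

		printf("client %d: first consumer, unmap queues / schedule reset\n",
		       client_id);
	}

	int main(void)
	{
		handle_poison_once(1);	/* performs the work */
		handle_poison_once(2);	/* sees the flag set, bails out */
		return 0;
	}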