diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 104 |
1 files changed, 97 insertions, 7 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index 543e7ea75593..0b75a37b689b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -1,5 +1,6 @@ +// SPDX-License-Identifier: GPL-2.0 OR MIT /* - * Copyright 2016-2018 Advanced Micro Devices, Inc. + * Copyright 2016-2022 Advanced Micro Devices, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), @@ -89,6 +90,76 @@ enum SQ_INTERRUPT_ERROR_TYPE { #define KFD_SQ_INT_DATA__ERR_TYPE_MASK 0xF00000 #define KFD_SQ_INT_DATA__ERR_TYPE__SHIFT 20 +static void event_interrupt_poison_consumption_v9(struct kfd_dev *dev, + uint16_t pasid, uint16_t client_id) +{ + int old_poison, ret = -EINVAL; + struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); + + if (!p) + return; + + /* all queues of a process will be unmapped in one time */ + old_poison = atomic_cmpxchg(&p->poison, 0, 1); + kfd_unref_process(p); + if (old_poison) + return; + + switch (client_id) { + case SOC15_IH_CLIENTID_SE0SH: + case SOC15_IH_CLIENTID_SE1SH: + case SOC15_IH_CLIENTID_SE2SH: + case SOC15_IH_CLIENTID_SE3SH: + case SOC15_IH_CLIENTID_UTCL2: + ret = kfd_dqm_evict_pasid(dev->dqm, pasid); + break; + case SOC15_IH_CLIENTID_SDMA0: + case SOC15_IH_CLIENTID_SDMA1: + case SOC15_IH_CLIENTID_SDMA2: + case SOC15_IH_CLIENTID_SDMA3: + case SOC15_IH_CLIENTID_SDMA4: + break; + default: + break; + } + + kfd_signal_poison_consumed_event(dev, pasid); + + /* resetting queue passes, do page retirement without gpu reset + * resetting queue fails, fallback to gpu reset solution + */ + if (!ret) { + dev_warn(dev->adev->dev, + "RAS poison consumption, unmap queue flow succeeded: client id %d\n", + client_id); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, false); + } else { + dev_warn(dev->adev->dev, + "RAS poison consumption, fall back to gpu reset flow: client id %d\n", + client_id); + amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, true); + } +} + +static bool context_id_expected(struct kfd_dev *dev) +{ + switch (KFD_GC_VERSION(dev)) { + case IP_VERSION(9, 0, 1): + return dev->mec_fw_version >= 0x817a; + case IP_VERSION(9, 1, 0): + case IP_VERSION(9, 2, 1): + case IP_VERSION(9, 2, 2): + case IP_VERSION(9, 3, 0): + case IP_VERSION(9, 4, 0): + return dev->mec_fw_version >= 0x17a; + default: + /* Other GFXv9 and later GPUs always sent valid context IDs + * on legitimate events + */ + return KFD_GC_VERSION(dev) >= IP_VERSION(9, 4, 1); + } +} + static bool event_interrupt_isr_v9(struct kfd_dev *dev, const uint32_t *ih_ring_entry, uint32_t *patched_ihre, @@ -135,7 +206,7 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, *patched_flag = true; memcpy(patched_ihre, ih_ring_entry, - dev->device_info->ih_ring_entry_size); + dev->device_info.ih_ring_entry_size); pasid = dev->dqm->vmid_pasid[vmid]; @@ -154,11 +225,26 @@ static bool event_interrupt_isr_v9(struct kfd_dev *dev, if (WARN_ONCE(pasid == 0, "Bug: No PASID in KFD interrupt")) return false; + /* Workaround CP firmware sending bogus signals with 0 context_id. + * Those can be safely ignored on hardware and firmware versions that + * include a valid context_id on legitimate signals. This avoids the + * slow path in kfd_signal_event_interrupt that scans all event slots + * for signaled events. + */ + if (source_id == SOC15_INTSRC_CP_END_OF_PIPE) { + uint32_t context_id = + SOC15_CONTEXT_ID0_FROM_IH_ENTRY(ih_ring_entry); + + if (context_id == 0 && context_id_expected(dev)) + return false; + } + /* Interrupt types we care about: various signals and faults. * They will be forwarded to a work queue (see below). */ return source_id == SOC15_INTSRC_CP_END_OF_PIPE || source_id == SOC15_INTSRC_SDMA_TRAP || + source_id == SOC15_INTSRC_SDMA_ECC || source_id == SOC15_INTSRC_SQ_INTERRUPT_MSG || source_id == SOC15_INTSRC_CP_BAD_OPCODE || ((client_id == SOC15_IH_CLIENTID_VMC || @@ -230,8 +316,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, sq_intr_err); if (sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_ILLEGAL_INST && sq_intr_err != SQ_INTERRUPT_ERROR_TYPE_MEMVIOL) { - kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd); + event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } break; @@ -252,8 +337,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, if (source_id == SOC15_INTSRC_SDMA_TRAP) { kfd_signal_event_interrupt(pasid, context_id0 & 0xfffffff, 28); } else if (source_id == SOC15_INTSRC_SDMA_ECC) { - kfd_signal_poison_consumed_event(dev, pasid); - amdgpu_amdkfd_ras_poison_consumption_handler(dev->kgd); + event_interrupt_poison_consumption_v9(dev, pasid, client_id); return; } } else if (client_id == SOC15_IH_CLIENTID_VMC || @@ -262,6 +346,12 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, struct kfd_vm_fault_info info = {0}; uint16_t ring_id = SOC15_RING_ID_FROM_IH_ENTRY(ih_ring_entry); + if (client_id == SOC15_IH_CLIENTID_UTCL2 && + amdgpu_amdkfd_ras_query_utcl2_poison_status(dev->adev)) { + event_interrupt_poison_consumption_v9(dev, pasid, client_id); + return; + } + info.vmid = vmid; info.mc_id = client_id; info.page_addr = ih_ring_entry[4] | @@ -271,7 +361,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, info.prot_write = ring_id & 0x20; kfd_smi_event_update_vmfault(dev, pasid); - kfd_process_vm_fault(dev->dqm, pasid); + kfd_dqm_evict_pasid(dev->dqm, pasid); kfd_signal_vm_fault_event(dev, pasid, &info); } } |