diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdkfd')
23 files changed, 1707 insertions, 546 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 61474627a32c..e1e4115dcf78 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,6 +53,7 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ + $(AMDKFD_PATH)/kfd_smi_events.o \ $(AMDKFD_PATH)/kfd_crat.o ifneq ($(CONFIG_AMD_IOMMU_V2),) diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba93cfe0..24b471734117 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(&info, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index d3400da6ab64..577d901fdb63 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h @@ -679,169 +679,175 @@ static const uint32_t cwsr_trap_gfx9_hex[] = { 0xbf810000, 0x00000000, }; -static const uint32_t cwsr_trap_gfx10_hex[] = { - 0xbf820001, 0xbf8201c1, +static const uint32_t cwsr_trap_nv1x_hex[] = { + 0xbf820001, 0xbf8201cd, 0xb0804004, 0xb978f802, - 0x8a788678, 0xb971f803, - 0x876eff71, 0x00000400, - 0xbf850033, 0x876eff71, - 0x00000100, 0xbf840002, - 0x8878ff78, 0x00002000, - 0x8a77ff77, 0xff000000, - 0xb96ef807, 0x876fff6e, - 0x02000000, 0x8f6f866f, - 0x88776f77, 0x876fff6e, - 0x003f8000, 0x8f6f896f, - 0x88776f77, 0x8a6eff6e, - 0x023f8000, 0xb9eef807, - 0xb97af812, 0xb97bf813, - 0x8ffa887a, 0xf4051bbd, - 0xfa000000, 0xbf8cc07f, - 0xf4051ebd, 0xfa000008, - 0xbf8cc07f, 0x87ee6e6e, - 0xbf840001, 0xbe80206e, - 0xb971f803, 0x8771ff71, - 0x000001ff, 0xbf850002, - 0x806c846c, 0x826d806d, - 0x876dff6d, 0x0000ffff, - 0x906e8977, 0x876fff6e, - 0x003f8000, 0x906e8677, - 0x876eff6e, 0x02000000, - 0x886e6f6e, 0xb9eef807, - 0x87fe7e7e, 0x87ea6a6a, - 0xb9f8f802, 0xbe80226c, - 0xb971f803, 0x8771ff71, - 0x00000100, 0xbf840006, - 0xbef60380, 0xb9f60203, - 0x876dff6d, 0x0000ffff, - 0x80ec886c, 0x82ed806d, - 0xbef60380, 0xb9f60283, - 0xb972f816, 0xb9762c07, - 0x8f769a76, 0x886d766d, - 0xb97603c7, 0x8f769976, - 0x886d766d, 0xb9760647, - 0x8f769876, 0x886d766d, - 0xb976f807, 0x8776ff76, - 0x00007fff, 0xb9f6f807, + 0x8a788678, 0xb96ef801, + 0x876eff6e, 0x00000800, + 0xbf840003, 0x876eff78, + 0x00002000, 0xbf840009, + 0xb97bf803, 0x876eff7b, + 0x00000400, 0xbf850033, + 0x876eff7b, 0x00000100, + 0xbf840002, 0x8878ff78, + 0x00002000, 0x8a77ff77, + 0xff000000, 0xb96ef807, + 0x876fff6e, 0x02000000, + 0x8f6f866f, 0x88776f77, + 0x876fff6e, 0x003f8000, + 0x8f6f896f, 0x88776f77, + 0x8a6eff6e, 0x023f8000, + 0xb9eef807, 0xb97af812, + 0xb97bf813, 0x8ffa887a, + 0xf4051bbd, 0xfa000000, + 0xbf8cc07f, 0xf4051ebd, + 0xfa000008, 0xbf8cc07f, + 0x87ee6e6e, 0xbf840001, + 0xbe80206e, 0xb97bf803, + 0x877bff7b, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x876dff6d, + 0x0000ffff, 0x906e8977, + 0x876fff6e, 0x003f8000, + 0x906e8677, 0x876eff6e, + 0x02000000, 0x886e6f6e, + 0xb9eef807, 0x87fe7e7e, + 0x87ea6a6a, 0xb9f8f802, + 0xbe80226c, 0x876dff6d, + 0x0000ffff, 0xbefa0380, + 0xb9fa0283, 0xb97a2c07, + 0x8f7a9a7a, 0x886d7a6d, + 0xb97a03c7, 0x8f7a997a, + 0x886d7a6d, 0xb97a0647, + 0x8f7a987a, 0x886d7a6d, + 0xb97af807, 0x877aff7a, + 0x00007fff, 0xb9faf807, 0xbeee037e, 0xbeef037f, 0xbefe0480, 0xbf900004, 0xbf8e0002, 0xbf88fffe, + 0xb97b02dc, 0x8f7b997b, + 0x887b7b7f, 0xb97a2a05, + 0x807a817a, 0xbf0d997b, + 0xbf850002, 0x8f7a897a, + 0xbf820001, 0x8f7a8a7a, + 0x877bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xf4491c3d, 0xfa000050, + 0xf4491d3d, 0xfa000060, + 0xf4411e7d, 0xfa000074, 0xbef4037e, 0x8775ff7f, 0x0000ffff, 0x8875ff75, 0x00040000, 0xbef60380, 0xbef703ff, 0x10807fac, - 0x8776ff7f, 0x08000000, - 0x90768376, 0x88777677, - 0x8776ff7f, 0x70000000, - 0x90768176, 0x88777677, - 0xbefb037c, 0xbefa0380, + 0x877aff7f, 0x08000000, + 0x907a837a, 0x88777a77, + 0x877aff7f, 0x70000000, + 0x907a817a, 0x88777a77, + 0xbef1037c, 0xbef00380, 0xb97302dc, 0x8f739973, - 0x8873737f, 0xb97a2a05, - 0x807a817a, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0x8f7a897a, - 0xbf820001, 0x8f7a8a7a, - 0xb9761e06, 0x8f768a76, - 0x807a767a, 0x807aff7a, - 0x00000200, 0xbef603ff, - 0x01000000, 0xbefe037c, - 0xbefc037a, 0xf4611efa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611b3a, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611b7a, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611bba, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611bfa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xbefe037c, - 0xbefc037a, 0xf4611e3a, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xb971f803, - 0xbefe037c, 0xbefc037a, - 0xf4611c7a, 0xf8000000, - 0x807a847a, 0xbefc037e, - 0xbefe037c, 0xbefc037a, - 0xf4611cba, 0xf8000000, - 0x807a847a, 0xbefc037e, - 0xb97bf801, 0xbefe037c, - 0xbefc037a, 0xf4611efa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0xb97bf814, - 0xbefe037c, 0xbefc037a, - 0xf4611efa, 0xf8000000, - 0x807a847a, 0xbefc037e, - 0xb97bf815, 0xbefe037c, - 0xbefc037a, 0xf4611efa, - 0xf8000000, 0x807a847a, - 0xbefc037e, 0x8776ff7f, - 0x04000000, 0xbeef0380, - 0x886f6f76, 0xb97a2a05, - 0x807a817a, 0x907c9973, - 0x877c817c, 0xbf06817c, - 0xbf850002, 0x8f7a897a, - 0xbf820001, 0x8f7a8a7a, - 0xb9761e06, 0x8f768a76, - 0x807a767a, 0xbef603ff, - 0x01000000, 0xbef20374, - 0x80747a74, 0x82758075, - 0xbefc0380, 0xbf800000, - 0xbe802f00, 0xbe822f02, - 0xbe842f04, 0xbe862f06, - 0xbe882f08, 0xbe8a2f0a, - 0xbe8c2f0c, 0xbe8e2f0e, - 0xf469003a, 0xfa000000, - 0xf469013a, 0xfa000010, - 0xf469023a, 0xfa000020, - 0xf469033a, 0xfa000030, - 0x8074c074, 0x82758075, - 0x807c907c, 0xbf0aff7c, - 0x00000060, 0xbf85ffea, - 0xbe802f00, 0xbe822f02, - 0xbe842f04, 0xbe862f06, - 0xbe882f08, 0xbe8a2f0a, - 0xf469003a, 0xfa000000, - 0xf469013a, 0xfa000010, - 0xf469023a, 0xfa000020, - 0x8074b074, 0x82758075, - 0xbef40372, 0xbefa0380, + 0x8873737f, 0xb97bf816, + 0xba80f816, 0x00000000, 0xbefe03c1, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820002, 0xbeff03c1, 0xbf82000b, 0xbef603ff, 0x01000000, 0xe0704000, - 0x7a5d0000, 0xe0704080, - 0x7a5d0100, 0xe0704100, - 0x7a5d0200, 0xe0704180, - 0x7a5d0300, 0xbf82000a, + 0x705d0000, 0xe0704080, + 0x705d0100, 0xe0704100, + 0x705d0200, 0xe0704180, + 0x705d0300, 0xbf82000a, 0xbef603ff, 0x01000000, - 0xe0704000, 0x7a5d0000, - 0xe0704100, 0x7a5d0100, - 0xe0704200, 0x7a5d0200, - 0xe0704300, 0x7a5d0300, + 0xe0704000, 0x705d0000, + 0xe0704100, 0x705d0100, + 0xe0704200, 0x705d0200, + 0xe0704300, 0x705d0300, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0xbef603ff, 0x01000000, + 0xbefe037c, 0xbefc0370, + 0xf4611c7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611b3a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611b7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611bba, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611bfa, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xbefe037c, 0xbefc0370, + 0xf4611e3a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xb97af803, 0xbefe037c, + 0xbefc0370, 0xf4611eba, + 0xf8000000, 0x80708470, + 0xbefc037e, 0xbefe037c, + 0xbefc0370, 0xf4611efa, + 0xf8000000, 0x80708470, + 0xbefc037e, 0xb971f801, + 0xbefe037c, 0xbefc0370, + 0xf4611c7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xb971f814, 0xbefe037c, + 0xbefc0370, 0xf4611c7a, + 0xf8000000, 0x80708470, + 0xbefc037e, 0xb971f815, + 0xbefe037c, 0xbefc0370, + 0xf4611c7a, 0xf8000000, + 0x80708470, 0xbefc037e, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0xbef603ff, 0x01000000, + 0xbefb0374, 0x80747074, + 0x82758075, 0xbefc0380, + 0xbf800000, 0xbe802f00, + 0xbe822f02, 0xbe842f04, + 0xbe862f06, 0xbe882f08, + 0xbe8a2f0a, 0xbe8c2f0c, + 0xbe8e2f0e, 0xf469003a, + 0xfa000000, 0xf469013a, + 0xfa000010, 0xf469023a, + 0xfa000020, 0xf469033a, + 0xfa000030, 0x8074c074, + 0x82758075, 0x807c907c, + 0xbf0aff7c, 0x00000060, + 0xbf85ffea, 0xbe802f00, + 0xbe822f02, 0xbe842f04, + 0xbe862f06, 0xbe882f08, + 0xbe8a2f0a, 0xf469003a, + 0xfa000000, 0xf469013a, + 0xfa000010, 0xf469023a, + 0xfa000020, 0x8074b074, + 0x82758075, 0xbef4037b, 0xbefe03c1, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, - 0xb9714306, 0x8771c171, - 0xbf840046, 0xbf8a0000, - 0x8776ff6f, 0x04000000, - 0xbf840042, 0x8f718671, - 0x8f718271, 0xbef60371, - 0xb97a2a05, 0x807a817a, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850002, - 0x8f7a897a, 0xbf820001, - 0x8f7a8a7a, 0xb9761e06, - 0x8f768a76, 0x807a767a, - 0x807aff7a, 0x00000200, - 0x807aff7a, 0x00000080, + 0xb97b4306, 0x877bc17b, + 0xbf840044, 0xbf8a0000, + 0x877aff73, 0x04000000, + 0xbf840040, 0x8f7b867b, + 0x8f7b827b, 0xbef6037b, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000080, 0xbef603ff, 0x01000000, 0xd7650000, 0x000100c1, 0xd7660000, 0x000200c1, @@ -852,87 +858,86 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, - 0xe0704000, 0x7a5d0100, - 0x807c037c, 0x807a037a, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, 0xd5250000, 0x0001ff00, - 0x00000080, 0xbf0a717c, + 0x00000080, 0xbf0a7b7c, 0xbf85fff4, 0xbf820011, 0xbe8303ff, 0x00000100, 0xbf800000, 0xbf800000, 0xbf800000, 0xd8d80000, 0x01000000, 0xbf8c0000, - 0xe0704000, 0x7a5d0100, - 0x807c037c, 0x807a037a, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, 0xd5250000, 0x0001ff00, - 0x00000100, 0xbf0a717c, + 0x00000100, 0xbf0a7b7c, 0xbf85fff4, 0xbefe03c1, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850004, - 0xbefa03ff, 0x00000200, + 0xbef003ff, 0x00000200, 0xbeff0380, 0xbf820003, - 0xbefa03ff, 0x00000400, - 0xbeff03c1, 0xb9712a05, - 0x80718171, 0x8f718271, + 0xbef003ff, 0x00000400, + 0xbeff03c1, 0xb97b2a05, + 0x807b817b, 0x8f7b827b, 0x907c9973, 0x877c817c, 0xbf06817c, 0xbf850017, 0xbef603ff, 0x01000000, - 0xbefc0384, 0xbf0a717c, + 0xbefc0384, 0xbf0a7b7c, 0xbf840037, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, 0xe0704000, - 0x7a5d0000, 0xe0704080, - 0x7a5d0100, 0xe0704100, - 0x7a5d0200, 0xe0704180, - 0x7a5d0300, 0x807c847c, - 0x807aff7a, 0x00000200, - 0xbf0a717c, 0xbf85ffef, + 0x705d0000, 0xe0704080, + 0x705d0100, 0xe0704100, + 0x705d0200, 0xe0704180, + 0x705d0300, 0x807c847c, + 0x8070ff70, 0x00000200, + 0xbf0a7b7c, 0xbf85ffef, 0xbf820025, 0xbef603ff, 0x01000000, 0xbefc0384, - 0xbf0a717c, 0xbf840020, + 0xbf0a7b7c, 0xbf840020, 0x7e008700, 0x7e028701, 0x7e048702, 0x7e068703, - 0xe0704000, 0x7a5d0000, - 0xe0704100, 0x7a5d0100, - 0xe0704200, 0x7a5d0200, - 0xe0704300, 0x7a5d0300, - 0x807c847c, 0x807aff7a, - 0x00000400, 0xbf0a717c, - 0xbf85ffef, 0xb9711e06, - 0x8771c171, 0xbf84000c, - 0x8f718371, 0x80717c71, + 0xe0704000, 0x705d0000, + 0xe0704100, 0x705d0100, + 0xe0704200, 0x705d0200, + 0xe0704300, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xb97b1e06, + 0x877bc17b, 0xbf84000c, + 0x8f7b837b, 0x807b7c7b, 0xbefe03c1, 0xbeff0380, 0x7e008700, 0xe0704000, - 0x7a5d0000, 0x807c817c, - 0x807aff7a, 0x00000080, - 0xbf0a717c, 0xbf85fff8, - 0xbf820142, 0xbef4037e, + 0x705d0000, 0x807c817c, + 0x8070ff70, 0x00000080, + 0xbf0a7b7c, 0xbf85fff8, + 0xbf82014f, 0xbef4037e, 0x8775ff7f, 0x0000ffff, 0x8875ff75, 0x00040000, 0xbef60380, 0xbef703ff, - 0x10807fac, 0x8772ff7f, - 0x08000000, 0x90728372, - 0x88777277, 0x8772ff7f, - 0x70000000, 0x90728172, - 0x88777277, 0xb97302dc, - 0x8f739973, 0x8873737f, - 0x8772ff7f, 0x04000000, - 0xbf840036, 0xbefe03c1, - 0x907c9973, 0x877c817c, + 0x10807fac, 0x876eff7f, + 0x08000000, 0x906e836e, + 0x88776e77, 0x876eff7f, + 0x70000000, 0x906e816e, + 0x88776e77, 0xb97202dc, + 0x8f729972, 0x8872727f, + 0x876eff7f, 0x04000000, + 0xbf840034, 0xbefe03c1, + 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb96f4306, - 0x876fc16f, 0xbf84002b, + 0x876fc16f, 0xbf840029, 0x8f6f866f, 0x8f6f826f, 0xbef6036f, 0xb9782a05, - 0x80788178, 0x907c9973, - 0x877c817c, 0xbf06817c, + 0x80788178, 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, 0x8f788a78, - 0xb9721e06, 0x8f728a72, - 0x80787278, 0x8078ff78, + 0xb96e1e06, 0x8f6e8a6e, + 0x80786e78, 0x8078ff78, 0x00000200, 0x8078ff78, 0x00000080, 0xbef603ff, - 0x01000000, 0x907c9973, + 0x01000000, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbefc0380, 0xbf850009, 0xe0310000, 0x781d0000, @@ -944,15 +949,15 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x00000100, 0x8078ff78, 0x00000100, 0xbf0a6f7c, 0xbf85fff8, 0xbef80380, - 0xbefe03c1, 0x907c9973, + 0xbefe03c1, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850002, 0xbeff0380, 0xbf820001, 0xbeff03c1, 0xb96f2a05, 0x806f816f, - 0x8f6f826f, 0x907c9973, + 0x8f6f826f, 0x907c9972, 0x877c817c, 0xbf06817c, 0xbf850021, 0xbef603ff, - 0x01000000, 0xbef20378, + 0x01000000, 0xbeee0378, 0x8078ff78, 0x00000200, 0xbefc0384, 0xe0304000, 0x785d0000, 0xe0304080, @@ -964,12 +969,12 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x807c847c, 0x8078ff78, 0x00000200, 0xbf0a6f7c, 0xbf85ffee, 0xe0304000, - 0x725d0000, 0xe0304080, - 0x725d0100, 0xe0304100, - 0x725d0200, 0xe0304180, - 0x725d0300, 0xbf820032, + 0x6e5d0000, 0xe0304080, + 0x6e5d0100, 0xe0304100, + 0x6e5d0200, 0xe0304180, + 0x6e5d0300, 0xbf820032, 0xbef603ff, 0x01000000, - 0xbef20378, 0x8078ff78, + 0xbeee0378, 0x8078ff78, 0x00000400, 0xbefc0384, 0xe0304000, 0x785d0000, 0xe0304100, 0x785d0100, @@ -989,16 +994,15 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0x8078ff78, 0x00000080, 0xbf0a6f7c, 0xbf85fff7, 0xbeff03c1, 0xe0304000, - 0x725d0000, 0xe0304100, - 0x725d0100, 0xe0304200, - 0x725d0200, 0xe0304300, - 0x725d0300, 0xbf8c3f70, + 0x6e5d0000, 0xe0304100, + 0x6e5d0100, 0xe0304200, + 0x6e5d0200, 0xe0304300, + 0x6e5d0300, 0xbf8c3f70, 0xb9782a05, 0x80788178, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850002, + 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, - 0x8f788a78, 0xb9721e06, - 0x8f728a72, 0x80787278, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, 0x80f8ff78, 0x00000050, 0xbef603ff, 0x01000000, @@ -1021,23 +1025,22 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbe8c310c, 0xbe8e310e, 0xbf06807c, 0xbf84fff0, 0xb9782a05, 0x80788178, - 0x907c9973, 0x877c817c, - 0xbf06817c, 0xbf850002, + 0xbf0d9972, 0xbf850002, 0x8f788978, 0xbf820001, - 0x8f788a78, 0xb9721e06, - 0x8f728a72, 0x80787278, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, 0x8078ff78, 0x00000200, 0xbef603ff, 0x01000000, 0xf4211bfa, 0xf0000000, 0x80788478, 0xf4211b3a, 0xf0000000, 0x80788478, 0xf4211b7a, 0xf0000000, - 0x80788478, 0xf4211eba, - 0xf0000000, 0x80788478, - 0xf4211efa, 0xf0000000, 0x80788478, 0xf4211c3a, 0xf0000000, 0x80788478, 0xf4211c7a, 0xf0000000, + 0x80788478, 0xf4211eba, + 0xf0000000, 0x80788478, + 0xf4211efa, 0xf0000000, 0x80788478, 0xf4211e7a, 0xf0000000, 0x80788478, 0xf4211cfa, 0xf0000000, @@ -1046,31 +1049,41 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { 0xbf8cc07f, 0xb9eef814, 0xf4211bba, 0xf0000000, 0x80788478, 0xbf8cc07f, - 0xb9eef815, 0xbef2036d, - 0x876dff72, 0x0000ffff, - 0xbefc036f, 0xbefe037a, - 0xbeff037b, 0x876f71ff, - 0x000003ff, 0xb9ef4803, - 0xb9f9f816, 0x876f71ff, - 0xfffff800, 0x906f8b6f, - 0xb9efa2c3, 0xb9f3f801, - 0x876fff72, 0xfc000000, - 0x906f9a6f, 0x8f6f906f, - 0xbef30380, 0x88736f73, - 0x876fff72, 0x02000000, - 0x906f996f, 0x8f6f8f6f, - 0x88736f73, 0x876fff72, - 0x01000000, 0x906f986f, - 0x8f6f996f, 0x88736f73, - 0x876fff70, 0x00800000, - 0x906f976f, 0xb9f3f807, - 0x87fe7e7e, 0x87ea6a6a, - 0xb9f0f802, 0xbf8a0000, - 0xbe80226c, 0xbf810000, + 0xb9eef815, 0xbefc036f, + 0xbefe0370, 0xbeff0371, + 0x876f7bff, 0x000003ff, + 0xb9ef4803, 0xb9f9f816, + 0x876f7bff, 0xfffff800, + 0x906f8b6f, 0xb9efa2c3, + 0xb9f3f801, 0xb96e2a05, + 0x806e816e, 0xbf0d9972, + 0xbf850002, 0x8f6e896e, + 0xbf820001, 0x8f6e8a6e, + 0x806eff6e, 0x00000200, + 0x806e746e, 0x826f8075, + 0x876fff6f, 0x0000ffff, + 0xf4091c37, 0xfa000050, + 0xf4091d37, 0xfa000060, + 0xf4011e77, 0xfa000074, + 0xbf8cc07f, 0x876fff6d, + 0xfc000000, 0x906f9a6f, + 0x8f6f906f, 0xbeee0380, + 0x886e6f6e, 0x876fff6d, + 0x02000000, 0x906f996f, + 0x8f6f8f6f, 0x886e6f6e, + 0x876fff6d, 0x01000000, + 0x906f986f, 0x8f6f996f, + 0x886e6f6e, 0x876fff7a, + 0x00800000, 0x906f976f, + 0xb9eef807, 0x876dff6d, + 0x0000ffff, 0x87fe7e7e, + 0x87ea6a6a, 0xb9faf802, + 0xbf8a0000, 0xbe80226c, + 0xbf810000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, 0xbf9f0000, - 0xbf9f0000, 0x00000000, }; + static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbf820001, 0xbf8202c4, 0xb8f8f802, 0x89788678, @@ -1560,3 +1573,399 @@ static const uint32_t cwsr_trap_arcturus_hex[] = { 0xbf8a0000, 0x95806f6c, 0xbf810000, 0x00000000, }; + +static const uint32_t cwsr_trap_gfx10_hex[] = { + 0xbf820001, 0xbf8201cf, + 0xb0804004, 0xb978f802, + 0x8a788678, 0xb96ef801, + 0x876eff6e, 0x00000800, + 0xbf840003, 0x876eff78, + 0x00002000, 0xbf840009, + 0xb97bf803, 0x876eff7b, + 0x00000400, 0xbf85001d, + 0x876eff7b, 0x00000100, + 0xbf840002, 0x8878ff78, + 0x00002000, 0xb97af812, + 0xb97bf813, 0x8ffa887a, + 0xf4051bbd, 0xfa000000, + 0xbf8cc07f, 0xf4051ebd, + 0xfa000008, 0xbf8cc07f, + 0x87ee6e6e, 0xbf840001, + 0xbe80206e, 0xb97bf803, + 0x877bff7b, 0x000001ff, + 0xbf850002, 0x806c846c, + 0x826d806d, 0x876dff6d, + 0x0000ffff, 0x87fe7e7e, + 0x87ea6a6a, 0xb9f8f802, + 0xbe80226c, 0x876dff6d, + 0x0000ffff, 0xbefa0380, + 0xb9fa0283, 0xbeee037e, + 0xbeef037f, 0xbefe0480, + 0xbf900004, 0xbf8cc07f, + 0xb97b02dc, 0x8f7b997b, + 0x887b7b7f, 0xb97a2a05, + 0x807a817a, 0xbf0d997b, + 0xbf850002, 0x8f7a897a, + 0xbf820001, 0x8f7a8a7a, + 0x877bff7f, 0x0000ffff, + 0x807aff7a, 0x00000200, + 0x807a7e7a, 0x827b807b, + 0xbef4037e, 0x8775ff7f, + 0x0000ffff, 0x8875ff75, + 0x00040000, 0xbef60380, + 0xbef703ff, 0x10807fac, + 0x877aff7f, 0x08000000, + 0x907a837a, 0x88777a77, + 0x877aff7f, 0x70000000, + 0x907a817a, 0x88777a77, + 0xbef1037c, 0xbef00380, + 0xb97302dc, 0x8f739973, + 0x8873737f, 0xbefe03c1, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820002, + 0xbeff03c1, 0xbf82000b, + 0xbef603ff, 0x01000000, + 0xe0704000, 0x705d0000, + 0xe0704080, 0x705d0100, + 0xe0704100, 0x705d0200, + 0xe0704180, 0x705d0300, + 0xbf82000a, 0xbef603ff, + 0x01000000, 0xe0704000, + 0x705d0000, 0xe0704100, + 0x705d0100, 0xe0704200, + 0x705d0200, 0xe0704300, + 0x705d0300, 0xb9702a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0x8070ff70, + 0x00000200, 0xbef603ff, + 0x01000000, 0x7e000280, + 0x7e020280, 0x7e040280, + 0xbefc0380, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xd7610002, 0x0000f86c, + 0x807c817c, 0xd7610002, + 0x0000f86d, 0x807c817c, + 0xd7610002, 0x0000f86e, + 0x807c817c, 0xd7610002, + 0x0000f86f, 0x807c817c, + 0xd7610002, 0x0000f878, + 0x807c817c, 0xb97af803, + 0xd7610002, 0x0000f87a, + 0x807c817c, 0xd7610002, + 0x0000f87b, 0x807c817c, + 0xb971f801, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xb971f814, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xb971f815, 0xd7610002, + 0x0000f871, 0x807c817c, + 0xbeff0380, 0xe0704000, + 0x705d0200, 0xb9702a05, + 0x80708170, 0xbf0d9973, + 0xbf850002, 0x8f708970, + 0xbf820001, 0x8f708a70, + 0xb97a1e06, 0x8f7a8a7a, + 0x80707a70, 0xbef603ff, + 0x01000000, 0xbef90380, + 0xbefc0380, 0xbf800000, + 0xbe802f00, 0xbe822f02, + 0xbe842f04, 0xbe862f06, + 0xbe882f08, 0xbe8a2f0a, + 0xbe8c2f0c, 0xbe8e2f0e, + 0xd7610002, 0x0000f200, + 0x80798179, 0xd7610002, + 0x0000f201, 0x80798179, + 0xd7610002, 0x0000f202, + 0x80798179, 0xd7610002, + 0x0000f203, 0x80798179, + 0xd7610002, 0x0000f204, + 0x80798179, 0xd7610002, + 0x0000f205, 0x80798179, + 0xd7610002, 0x0000f206, + 0x80798179, 0xd7610002, + 0x0000f207, 0x80798179, + 0xd7610002, 0x0000f208, + 0x80798179, 0xd7610002, + 0x0000f209, 0x80798179, + 0xd7610002, 0x0000f20a, + 0x80798179, 0xd7610002, + 0x0000f20b, 0x80798179, + 0xd7610002, 0x0000f20c, + 0x80798179, 0xd7610002, + 0x0000f20d, 0x80798179, + 0xd7610002, 0x0000f20e, + 0x80798179, 0xd7610002, + 0x0000f20f, 0x80798179, + 0xbf06a079, 0xbf840006, + 0xe0704000, 0x705d0200, + 0x8070ff70, 0x00000080, + 0xbef90380, 0x7e040280, + 0x807c907c, 0xbf0aff7c, + 0x00000060, 0xbf85ffbc, + 0xbe802f00, 0xbe822f02, + 0xbe842f04, 0xbe862f06, + 0xbe882f08, 0xbe8a2f0a, + 0xd7610002, 0x0000f200, + 0x80798179, 0xd7610002, + 0x0000f201, 0x80798179, + 0xd7610002, 0x0000f202, + 0x80798179, 0xd7610002, + 0x0000f203, 0x80798179, + 0xd7610002, 0x0000f204, + 0x80798179, 0xd7610002, + 0x0000f205, 0x80798179, + 0xd7610002, 0x0000f206, + 0x80798179, 0xd7610002, + 0x0000f207, 0x80798179, + 0xd7610002, 0x0000f208, + 0x80798179, 0xd7610002, + 0x0000f209, 0x80798179, + 0xd7610002, 0x0000f20a, + 0x80798179, 0xd7610002, + 0x0000f20b, 0x80798179, + 0xe0704000, 0x705d0200, + 0xbefe03c1, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb97b4306, 0x877bc17b, + 0xbf840044, 0xbf8a0000, + 0x877aff73, 0x04000000, + 0xbf840040, 0x8f7b867b, + 0x8f7b827b, 0xbef6037b, + 0xb9702a05, 0x80708170, + 0xbf0d9973, 0xbf850002, + 0x8f708970, 0xbf820001, + 0x8f708a70, 0xb97a1e06, + 0x8f7a8a7a, 0x80707a70, + 0x8070ff70, 0x00000200, + 0x8070ff70, 0x00000080, + 0xbef603ff, 0x01000000, + 0xd7650000, 0x000100c1, + 0xd7660000, 0x000200c1, + 0x16000084, 0x907c9973, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850012, + 0xbe8303ff, 0x00000080, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000080, 0xbf0a7b7c, + 0xbf85fff4, 0xbf820011, + 0xbe8303ff, 0x00000100, + 0xbf800000, 0xbf800000, + 0xbf800000, 0xd8d80000, + 0x01000000, 0xbf8c0000, + 0xe0704000, 0x705d0100, + 0x807c037c, 0x80700370, + 0xd5250000, 0x0001ff00, + 0x00000100, 0xbf0a7b7c, + 0xbf85fff4, 0xbefe03c1, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850004, + 0xbef003ff, 0x00000200, + 0xbeff0380, 0xbf820003, + 0xbef003ff, 0x00000400, + 0xbeff03c1, 0xb97b2a05, + 0x807b817b, 0x8f7b827b, + 0x907c9973, 0x877c817c, + 0xbf06817c, 0xbf850017, + 0xbef603ff, 0x01000000, + 0xbefc0384, 0xbf0a7b7c, + 0xbf840037, 0x7e008700, + 0x7e028701, 0x7e048702, + 0x7e068703, 0xe0704000, + 0x705d0000, 0xe0704080, + 0x705d0100, 0xe0704100, + 0x705d0200, 0xe0704180, + 0x705d0300, 0x807c847c, + 0x8070ff70, 0x00000200, + 0xbf0a7b7c, 0xbf85ffef, + 0xbf820025, 0xbef603ff, + 0x01000000, 0xbefc0384, + 0xbf0a7b7c, 0xbf840020, + 0x7e008700, 0x7e028701, + 0x7e048702, 0x7e068703, + 0xe0704000, 0x705d0000, + 0xe0704100, 0x705d0100, + 0xe0704200, 0x705d0200, + 0xe0704300, 0x705d0300, + 0x807c847c, 0x8070ff70, + 0x00000400, 0xbf0a7b7c, + 0xbf85ffef, 0xb97b1e06, + 0x877bc17b, 0xbf84000c, + 0x8f7b837b, 0x807b7c7b, + 0xbefe03c1, 0xbeff0380, + 0x7e008700, 0xe0704000, + 0x705d0000, 0x807c817c, + 0x8070ff70, 0x00000080, + 0xbf0a7b7c, 0xbf85fff8, + 0xbf82013a, 0xbef4037e, + 0x8775ff7f, 0x0000ffff, + 0x8875ff75, 0x00040000, + 0xbef60380, 0xbef703ff, + 0x10807fac, 0x876eff7f, + 0x08000000, 0x906e836e, + 0x88776e77, 0x876eff7f, + 0x70000000, 0x906e816e, + 0x88776e77, 0xb97202dc, + 0x8f729972, 0x8872727f, + 0x876eff7f, 0x04000000, + 0xbf840034, 0xbefe03c1, + 0x907c9972, 0x877c817c, + 0xbf06817c, 0xbf850002, + 0xbeff0380, 0xbf820001, + 0xbeff03c1, 0xb96f4306, + 0x876fc16f, 0xbf840029, + 0x8f6f866f, 0x8f6f826f, + 0xbef6036f, 0xb9782a05, + 0x80788178, 0xbf0d9972, + 0xbf850002, 0x8f788978, + 0xbf820001, 0x8f788a78, + 0xb96e1e06, 0x8f6e8a6e, + 0x80786e78, 0x8078ff78, + 0x00000200, 0x8078ff78, + 0x00000080, 0xbef603ff, + 0x01000000, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbefc0380, 0xbf850009, + 0xe0310000, 0x781d0000, + 0x807cff7c, 0x00000080, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff8, + 0xbf820008, 0xe0310000, + 0x781d0000, 0x807cff7c, + 0x00000100, 0x8078ff78, + 0x00000100, 0xbf0a6f7c, + 0xbf85fff8, 0xbef80380, + 0xbefe03c1, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850002, 0xbeff0380, + 0xbf820001, 0xbeff03c1, + 0xb96f2a05, 0x806f816f, + 0x8f6f826f, 0x907c9972, + 0x877c817c, 0xbf06817c, + 0xbf850021, 0xbef603ff, + 0x01000000, 0xbeee0378, + 0x8078ff78, 0x00000200, + 0xbefc0384, 0xe0304000, + 0x785d0000, 0xe0304080, + 0x785d0100, 0xe0304100, + 0x785d0200, 0xe0304180, + 0x785d0300, 0xbf8c3f70, + 0x7e008500, 0x7e028501, + 0x7e048502, 0x7e068503, + 0x807c847c, 0x8078ff78, + 0x00000200, 0xbf0a6f7c, + 0xbf85ffee, 0xe0304000, + 0x6e5d0000, 0xe0304080, + 0x6e5d0100, 0xe0304100, + 0x6e5d0200, 0xe0304180, + 0x6e5d0300, 0xbf820032, + 0xbef603ff, 0x01000000, + 0xbeee0378, 0x8078ff78, + 0x00000400, 0xbefc0384, + 0xe0304000, 0x785d0000, + 0xe0304100, 0x785d0100, + 0xe0304200, 0x785d0200, + 0xe0304300, 0x785d0300, + 0xbf8c3f70, 0x7e008500, + 0x7e028501, 0x7e048502, + 0x7e068503, 0x807c847c, + 0x8078ff78, 0x00000400, + 0xbf0a6f7c, 0xbf85ffee, + 0xb96f1e06, 0x876fc16f, + 0xbf84000e, 0x8f6f836f, + 0x806f7c6f, 0xbefe03c1, + 0xbeff0380, 0xe0304000, + 0x785d0000, 0xbf8c3f70, + 0x7e008500, 0x807c817c, + 0x8078ff78, 0x00000080, + 0xbf0a6f7c, 0xbf85fff7, + 0xbeff03c1, 0xe0304000, + 0x6e5d0000, 0xe0304100, + 0x6e5d0100, 0xe0304200, + 0x6e5d0200, 0xe0304300, + 0x6e5d0300, 0xbf8c3f70, + 0xb9782a05, 0x80788178, + 0xbf0d9972, 0xbf850002, + 0x8f788978, 0xbf820001, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0x80f8ff78, 0x00000050, + 0xbef603ff, 0x01000000, + 0xbefc03ff, 0x0000006c, + 0x80f89078, 0xf429003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc847c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0x80f8a078, 0xf42d003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc887c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0xbe843104, 0xbe863106, + 0x80f8c078, 0xf431003a, + 0xf0000000, 0xbf8cc07f, + 0x80fc907c, 0xbf800000, + 0xbe803100, 0xbe823102, + 0xbe843104, 0xbe863106, + 0xbe883108, 0xbe8a310a, + 0xbe8c310c, 0xbe8e310e, + 0xbf06807c, 0xbf84fff0, + 0xb9782a05, 0x80788178, + 0xbf0d9972, 0xbf850002, + 0x8f788978, 0xbf820001, + 0x8f788a78, 0xb96e1e06, + 0x8f6e8a6e, 0x80786e78, + 0x8078ff78, 0x00000200, + 0xbef603ff, 0x01000000, + 0xf4211bfa, 0xf0000000, + 0x80788478, 0xf4211b3a, + 0xf0000000, 0x80788478, + 0xf4211b7a, 0xf0000000, + 0x80788478, 0xf4211c3a, + 0xf0000000, 0x80788478, + 0xf4211c7a, 0xf0000000, + 0x80788478, 0xf4211eba, + 0xf0000000, 0x80788478, + 0xf4211efa, 0xf0000000, + 0x80788478, 0xf4211e7a, + 0xf0000000, 0x80788478, + 0xf4211cfa, 0xf0000000, + 0x80788478, 0xf4211bba, + 0xf0000000, 0x80788478, + 0xbf8cc07f, 0xb9eef814, + 0xf4211bba, 0xf0000000, + 0x80788478, 0xbf8cc07f, + 0xb9eef815, 0xbefc036f, + 0xbefe0370, 0xbeff0371, + 0x876f7bff, 0x000003ff, + 0xb9ef4803, 0x876f7bff, + 0xfffff800, 0x906f8b6f, + 0xb9efa2c3, 0xb9f3f801, + 0xb96e2a05, 0x806e816e, + 0xbf0d9972, 0xbf850002, + 0x8f6e896e, 0xbf820001, + 0x8f6e8a6e, 0x806eff6e, + 0x00000200, 0x806e746e, + 0x826f8075, 0x876fff6f, + 0x0000ffff, 0xf4091c37, + 0xfa000050, 0xf4091d37, + 0xfa000060, 0xf4011e77, + 0xfa000074, 0xbf8cc07f, + 0x876dff6d, 0x0000ffff, + 0x87fe7e7e, 0x87ea6a6a, + 0xb9faf802, 0xbf8a0000, + 0xbe80226c, 0xbf810000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0xbf9f0000, + 0xbf9f0000, 0x00000000, +}; diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm index 4433bda2ce25..5b220f2a7501 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm @@ -20,6 +20,21 @@ * OTHER DEALINGS IN THE SOFTWARE. */ +/* To compile this assembly code: + * + * Navi1x: + * cpp -DASIC_TARGET_NAVI1X=1 cwsr_trap_handler_gfx10.asm -P -o nv1x.sp3 + * sp3-nv1x nv1x.sp3 -hex nv1x.hex + * + * Others: + * cpp -DASIC_TARGET_NAVI1X=0 cwsr_trap_handler_gfx10.asm -P -o gfx10.sp3 + * sp3-gfx10 gfx10.sp3 -hex gfx10.hex + */ + +#define NO_SQC_STORE !ASIC_TARGET_NAVI1X + +var SINGLE_STEP_MISSED_WORKAROUND = 1 //workaround for lost MODE.DEBUG_EN exception when SAVECTX raised + var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 @@ -59,6 +74,8 @@ var SQ_WAVE_IB_STS_RCNT_SIZE = 6 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK = 0x003F8000 var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF +var SQ_WAVE_MODE_DEBUG_EN_MASK = 0x800 + var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 @@ -96,17 +113,19 @@ var s_save_pc_hi = ttmp1 var s_save_exec_lo = ttmp2 var s_save_exec_hi = ttmp3 var s_save_status = ttmp12 -var s_save_trapsts = ttmp5 -var s_save_xnack_mask = ttmp6 +var s_save_trapsts = ttmp15 +var s_save_xnack_mask = s_save_trapsts var s_wave_size = ttmp7 var s_save_buf_rsrc0 = ttmp8 var s_save_buf_rsrc1 = ttmp9 var s_save_buf_rsrc2 = ttmp10 var s_save_buf_rsrc3 = ttmp11 -var s_save_mem_offset = ttmp14 +var s_save_mem_offset = ttmp4 var s_save_alloc_size = s_save_trapsts -var s_save_tmp = s_save_buf_rsrc2 -var s_save_m0 = ttmp15 +var s_save_tmp = ttmp14 +var s_save_m0 = ttmp5 +var s_save_ttmps_lo = s_save_tmp +var s_save_ttmps_hi = s_save_trapsts var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC @@ -128,23 +147,25 @@ var s_restore_spi_init_lo = exec_lo var s_restore_spi_init_hi = exec_hi var s_restore_mem_offset = ttmp12 var s_restore_alloc_size = ttmp3 -var s_restore_tmp = ttmp6 +var s_restore_tmp = ttmp2 var s_restore_mem_offset_save = s_restore_tmp var s_restore_m0 = s_restore_alloc_size var s_restore_mode = ttmp7 -var s_restore_flat_scratch = ttmp2 +var s_restore_flat_scratch = s_restore_tmp var s_restore_pc_lo = ttmp0 var s_restore_pc_hi = ttmp1 -var s_restore_exec_lo = ttmp14 -var s_restore_exec_hi = ttmp15 -var s_restore_status = ttmp4 -var s_restore_trapsts = ttmp5 +var s_restore_exec_lo = ttmp4 +var s_restore_exec_hi = ttmp5 +var s_restore_status = ttmp14 +var s_restore_trapsts = ttmp15 var s_restore_xnack_mask = ttmp13 var s_restore_buf_rsrc0 = ttmp8 var s_restore_buf_rsrc1 = ttmp9 var s_restore_buf_rsrc2 = ttmp10 var s_restore_buf_rsrc3 = ttmp11 -var s_restore_size = ttmp7 +var s_restore_size = ttmp6 +var s_restore_ttmps_lo = s_restore_tmp +var s_restore_ttmps_hi = s_restore_alloc_size shader main asic(DEFAULT) @@ -159,6 +180,24 @@ L_JUMP_TO_RESTORE: L_SKIP_RESTORE: s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK + +if SINGLE_STEP_MISSED_WORKAROUND + // No single step exceptions if MODE.DEBUG_EN=0. + s_getreg_b32 ttmp2, hwreg(HW_REG_MODE) + s_and_b32 ttmp2, ttmp2, SQ_WAVE_MODE_DEBUG_EN_MASK + s_cbranch_scc0 L_NO_SINGLE_STEP_WORKAROUND + + // Second-level trap already handled exception if STATUS.HALT=1. + s_and_b32 ttmp2, s_save_status, SQ_WAVE_STATUS_HALT_MASK + + // Prioritize single step exception over context save. + // Second-level trap will halt wave and RFE, re-entering for SAVECTX. + s_cbranch_scc0 L_FETCH_2ND_TRAP + +L_NO_SINGLE_STEP_WORKAROUND: +end + + s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) s_and_b32 ttmp2, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save s_cbranch_scc1 L_SAVE @@ -170,6 +209,8 @@ L_SKIP_RESTORE: s_or_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_HALT_MASK L_FETCH_2ND_TRAP: + +#if ASIC_TARGET_NAVI1X // Preserve and clear scalar XNACK state before issuing scalar loads. // Save IB_STS.REPLAY_W64H[25], RCNT[21:16], FIRST_REPLAY[15] into // unused space ttmp11[31:24]. @@ -183,6 +224,7 @@ L_FETCH_2ND_TRAP: s_or_b32 ttmp11, ttmp11, ttmp3 s_andn2_b32 ttmp2, ttmp2, (SQ_WAVE_IB_STS_REPLAY_W64H_MASK | SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK) s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +#endif // Read second-level TBA/TMA from first-level TMA and jump if available. // ttmp[2:5] and ttmp12 can be used (others hold SPI-initialized debug data) @@ -207,6 +249,7 @@ L_NO_NEXT_TRAP: L_EXCP_CASE: s_and_b32 ttmp1, ttmp1, 0xFFFF +#if ASIC_TARGET_NAVI1X // Restore SQ_WAVE_IB_STS. s_lshr_b32 ttmp2, ttmp11, (TTMP11_SAVE_RCNT_FIRST_REPLAY_SHIFT - SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT) s_and_b32 ttmp3, ttmp2, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK @@ -214,6 +257,7 @@ L_EXCP_CASE: s_and_b32 ttmp2, ttmp2, SQ_WAVE_IB_STS_REPLAY_W64H_MASK s_or_b32 ttmp2, ttmp2, ttmp3 s_setreg_b32 hwreg(HW_REG_IB_STS), ttmp2 +#endif // Restore SQ_WAVE_STATUS. s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 @@ -223,23 +267,11 @@ L_EXCP_CASE: s_rfe_b64 [ttmp0, ttmp1] L_SAVE: - //check whether there is mem_viol - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK - s_cbranch_scc0 L_NO_PC_REWIND - - //if so, need rewind PC assuming GDS operation gets NACKed - s_mov_b32 s_save_tmp, 0 - s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] - s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 - s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 - -L_NO_PC_REWIND: s_mov_b32 s_save_tmp, 0 s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit - s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) +#if ASIC_TARGET_NAVI1X s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp @@ -253,6 +285,7 @@ L_NO_PC_REWIND: s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp +#endif /* inform SPI the readiness and wait for SPI's go signal */ s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI @@ -261,12 +294,31 @@ L_NO_PC_REWIND: s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC +#if ASIC_TARGET_NAVI1X L_SLEEP: // sleep 1 (64clk) is not enough for 8 waves per SIMD, which will cause // SQ hang, since the 7,8th wave could not get arbit to exec inst, while // other waves are stuck into the sleep-loop and waiting for wrexec!=0 s_sleep 0x2 s_cbranch_execz L_SLEEP +#else + s_waitcnt lgkmcnt(0) +#endif + + // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_wave_size(s_save_ttmps_hi) + get_vgpr_size_bytes(s_save_ttmps_lo, s_save_ttmps_hi) + s_and_b32 s_save_ttmps_hi, s_save_spi_init_hi, 0xFFFF + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo + s_addc_u32 s_save_ttmps_hi, s_save_ttmps_hi, 0x0 + +#if ASIC_TARGET_NAVI1X + s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, s_save_ttmps_hi], 0x50 glc:1 + s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, s_save_ttmps_hi], 0x60 glc:1 + s_store_dword ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1 +#endif /* setup Resource Contants */ s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo @@ -285,9 +337,45 @@ L_SLEEP: /* global mem offset */ s_mov_b32 s_save_mem_offset, 0x0 - s_getreg_b32 s_wave_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) - s_lshl_b32 s_wave_size, s_wave_size, S_WAVE_SIZE - s_or_b32 s_wave_size, s_save_spi_init_hi, s_wave_size //share s_wave_size with exec_hi, it's at bit25 + get_wave_size(s_wave_size) + +#if ASIC_TARGET_NAVI1X + // Save and clear vector XNACK state late to free up SGPRs. + s_getreg_b32 s_save_xnack_mask, hwreg(HW_REG_SHADER_XNACK_MASK) + s_setreg_imm32_b32 hwreg(HW_REG_SHADER_XNACK_MASK), 0x0 +#endif + + /* save first 4 VGPRs, needed for SGPR save */ + s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on + s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_and_b32 m0, m0, 1 + s_cmp_eq_u32 m0, 1 + s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI + s_mov_b32 exec_hi, 0x00000000 + s_branch L_SAVE_4VGPR_WAVE32 +L_ENABLE_SAVE_4VGPR_EXEC_HI: + s_mov_b32 exec_hi, 0xFFFFFFFF + s_branch L_SAVE_4VGPR_WAVE64 +L_SAVE_4VGPR_WAVE32: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 + s_branch L_SAVE_HWREG + +L_SAVE_4VGPR_WAVE64: + s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes + + // VGPR Allocated in 4-GPR granularity + + buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 + buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 /* save HW registers */ @@ -300,6 +388,13 @@ L_SAVE_HWREG: s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if NO_SQC_STORE + v_mov_b32 v0, 0x0 //Offset[31:0] from buffer resource + v_mov_b32 v1, 0x0 //Offset[63:32] from buffer resource + v_mov_b32 v2, 0x0 //Set of SGPRs for TCP store + s_mov_b32 m0, 0x0 //Next lane of v2 to write to +#endif + write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset) @@ -307,8 +402,10 @@ L_SAVE_HWREG: write_hwreg_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset) write_hwreg_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset) - s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) - write_hwreg_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset) + s_getreg_b32 s_save_tmp, hwreg(HW_REG_TRAPSTS) + write_hwreg_to_mem(s_save_tmp, s_save_buf_rsrc0, s_save_mem_offset) + + // Not used on Sienna_Cichlid but keep layout same for debugger. write_hwreg_to_mem(s_save_xnack_mask, s_save_buf_rsrc0, s_save_mem_offset) s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) @@ -320,10 +417,11 @@ L_SAVE_HWREG: s_getreg_b32 s_save_m0, hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI) write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset) - /* the first wave in the threadgroup */ - s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK - s_mov_b32 s_save_exec_hi, 0x0 - s_or_b32 s_save_exec_hi, s_save_tmp, s_save_exec_hi // save first wave bit to s_save_exec_hi.bits[26] +#if NO_SQC_STORE + // Write HWREG/SGPRs with 32 VGPR lanes, wave32 is common case. + s_mov_b32 exec_hi, 0x0 + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +#endif /* save SGPRs */ // Save SGPR before LDS save, then the s0 to s4 can be used during LDS save... @@ -334,10 +432,14 @@ L_SAVE_HWREG: s_add_u32 s_save_mem_offset, s_save_mem_offset, s_save_tmp s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes +#if NO_SQC_STORE + s_mov_b32 ttmp13, 0x0 //next VGPR lane to copy SGPR into +#else // backup s_save_buf_rsrc0,1 to s_save_pc_lo/hi, since write_16sgpr_to_mem function will change the rsrc0 s_mov_b32 s_save_xnack_mask, s_save_buf_rsrc0 s_add_u32 s_save_buf_rsrc0, s_save_buf_rsrc0, s_save_mem_offset s_addc_u32 s_save_buf_rsrc1, s_save_buf_rsrc1, 0 +#endif s_mov_b32 m0, 0x0 //SGPR initial index value =0 s_nop 0x0 //Manually inserted wait states @@ -353,6 +455,18 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s14, s14 //s14 = s[14+m0], s15 = s[15+m0] write_16sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) + +#if NO_SQC_STORE + s_cmp_eq_u32 ttmp13, 0x20 //have 32 VGPR lanes filled? + s_cbranch_scc0 L_SAVE_SGPR_SKIP_TCP_STORE + + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 + s_add_u32 s_save_mem_offset, s_save_mem_offset, 0x80 + s_mov_b32 ttmp13, 0x0 + v_mov_b32 v2, 0x0 +L_SAVE_SGPR_SKIP_TCP_STORE: +#endif + s_add_u32 m0, m0, 16 //next sgpr index s_cmp_lt_u32 m0, 96 //scc = (m0 < first 96 SGPR) ? 1 : 0 s_cbranch_scc1 L_SAVE_SGPR_LOOP //first 96 SGPR save is complete? @@ -366,43 +480,12 @@ L_SAVE_SGPR_LOOP: s_movrels_b64 s10, s10 //s10 = s[10+m0], s11 = s[11+m0] write_12sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset) +#if NO_SQC_STORE + buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 +#else // restore s_save_buf_rsrc0,1 s_mov_b32 s_save_buf_rsrc0, s_save_xnack_mask - - /* save first 4 VGPR, then LDS save could use */ - // each wave will alloc 4 vgprs at least... - - s_mov_b32 s_save_mem_offset, 0 - s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on - s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE - s_and_b32 m0, m0, 1 - s_cmp_eq_u32 m0, 1 - s_cbranch_scc1 L_ENABLE_SAVE_4VGPR_EXEC_HI - s_mov_b32 exec_hi, 0x00000000 - s_branch L_SAVE_4VGPR_WAVE32 -L_ENABLE_SAVE_4VGPR_EXEC_HI: - s_mov_b32 exec_hi, 0xFFFFFFFF - s_branch L_SAVE_4VGPR_WAVE64 -L_SAVE_4VGPR_WAVE32: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:128*3 - s_branch L_SAVE_LDS - -L_SAVE_4VGPR_WAVE64: - s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - - // VGPR Allocated in 4-GPR granularity - - buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 - buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 - buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 - buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 +#endif /* save LDS */ @@ -423,7 +506,7 @@ L_SAVE_LDS_NORMAL: s_cbranch_scc0 L_SAVE_LDS_DONE //no lds used? jump to L_SAVE_DONE s_barrier //LDS is used? wait for other waves in the same TG - s_and_b32 s_save_tmp, s_save_exec_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK + s_and_b32 s_save_tmp, s_wave_size, S_SAVE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_SAVE_LDS_DONE // first wave do LDS save; @@ -598,9 +681,7 @@ L_RESTORE: s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE //determine it is wave32 or wave64 - s_getreg_b32 s_restore_size, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) - s_lshl_b32 s_restore_size, s_restore_size, S_WAVE_SIZE - s_or_b32 s_restore_size, s_restore_spi_init_hi, s_restore_size + get_wave_size(s_restore_size) s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK s_cbranch_scc0 L_RESTORE_VGPR @@ -634,7 +715,7 @@ L_RESTORE_LDS_NORMAL: s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes - s_lshr_b32 m0, s_wave_size, S_WAVE_SIZE + s_lshr_b32 m0, s_restore_size, S_WAVE_SIZE s_and_b32 m0, m0, 1 s_cmp_eq_u32 m0, 1 s_mov_b32 m0, 0x0 @@ -842,38 +923,55 @@ L_RESTORE_HWREG: s_setreg_b32 hwreg(HW_REG_SHADER_FLAT_SCRATCH_HI), s_restore_flat_scratch - s_mov_b32 s_restore_tmp, s_restore_pc_hi - s_and_b32 s_restore_pc_hi, s_restore_tmp, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS - s_mov_b32 m0, s_restore_m0 s_mov_b32 exec_lo, s_restore_exec_lo s_mov_b32 exec_hi, s_restore_exec_hi s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 + +#if ASIC_TARGET_NAVI1X s_setreg_b32 hwreg(HW_REG_SHADER_XNACK_MASK), s_restore_xnack_mask +#endif + s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK + + // Restore trap temporaries 4-11, 13 initialized by SPI debug dispatch logic + // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40 + get_vgpr_size_bytes(s_restore_ttmps_lo, s_restore_size) + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, get_sgpr_size_bytes() + s_add_u32 s_restore_ttmps_lo, s_restore_ttmps_lo, s_restore_buf_rsrc0 + s_addc_u32 s_restore_ttmps_hi, s_restore_buf_rsrc1, 0x0 + s_and_b32 s_restore_ttmps_hi, s_restore_ttmps_hi, 0xFFFF + s_load_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x50 glc:1 + s_load_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x60 glc:1 + s_load_dword ttmp13, [s_restore_ttmps_lo, s_restore_ttmps_hi], 0x74 glc:1 + s_waitcnt lgkmcnt(0) + +#if ASIC_TARGET_NAVI1X + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_RCNT_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT - s_mov_b32 s_restore_mode, 0x0 - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK + s_mov_b32 s_restore_tmp, 0x0 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_FIRST_REPLAY_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 - s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_REPLAY_W64H_MASK + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 + s_and_b32 s_restore_m0, s_restore_pc_hi, S_SAVE_PC_HI_REPLAY_W64H_MASK s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_REPLAY_W64H_SHIFT s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_REPLAY_W64H_SHIFT - s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 + s_or_b32 s_restore_tmp, s_restore_tmp, s_restore_m0 s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT - s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_mode + s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_tmp +#endif + s_and_b32 s_restore_pc_hi, s_restore_pc_hi, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status // SCC is included, which is changed by previous salu @@ -887,32 +985,53 @@ L_END_PGM: end function write_hwreg_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + v_writelane_b32 v2, s, m0 + s_add_u32 m0, m0, 0x1 +#else s_mov_b32 exec_lo, m0 s_mov_b32 m0, s_mem_offset s_buffer_store_dword s, s_rsrc, m0 glc:1 s_add_u32 s_mem_offset, s_mem_offset, 4 s_mov_b32 m0, exec_lo +#endif end function write_16sgpr_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 16; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], ttmp13 + s_add_u32 ttmp13, ttmp13, 0x1 + end +#else s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_buffer_store_dwordx4 s[12], s_rsrc, 48 glc:1 s_add_u32 s_rsrc[0], s_rsrc[0], 4*16 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +#endif end function write_12sgpr_to_mem(s, s_rsrc, s_mem_offset) +#if NO_SQC_STORE + // Copy into VGPR for later TCP store. + for var sgpr_idx = 0; sgpr_idx < 12; sgpr_idx ++ + v_writelane_b32 v2, s[sgpr_idx], ttmp13 + s_add_u32 ttmp13, ttmp13, 0x1 + end +#else s_buffer_store_dwordx4 s[0], s_rsrc, 0 glc:1 s_buffer_store_dwordx4 s[4], s_rsrc, 16 glc:1 s_buffer_store_dwordx4 s[8], s_rsrc, 32 glc:1 s_add_u32 s_rsrc[0], s_rsrc[0], 4*12 s_addc_u32 s_rsrc[1], s_rsrc[1], 0x0 +#endif end - function read_hwreg_from_mem(s, s_rsrc, s_mem_offset) s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 s_add_u32 s_mem_offset, s_mem_offset, 4 @@ -942,9 +1061,7 @@ end function get_vgpr_size_bytes(s_vgpr_size_byte, s_size) s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 - s_lshr_b32 m0, s_size, S_WAVE_SIZE - s_and_b32 m0, m0, 1 - s_cmp_eq_u32 m0, 1 + s_bitcmp1_b32 s_size, S_WAVE_SIZE s_cbranch_scc1 L_ENABLE_SHIFT_W64 s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+7) //Number of VGPRs = (vgpr_size + 1) * 4 * 32 * 4 (non-zero value) s_branch L_SHIFT_DONE @@ -965,3 +1082,9 @@ end function get_hwreg_size_bytes return 128 end + +function get_wave_size(s_reg) + s_getreg_b32 s_reg, hwreg(HW_REG_IB_STS2,SQ_WAVE_IB_STS2_WAVE64_SHIFT,SQ_WAVE_IB_STS2_WAVE64_SIZE) + s_lshl_b32 s_reg, s_reg, S_WAVE_SIZE + s_or_b32 s_reg, s_save_spi_init_hi, s_reg //share with exec_hi, it's at bit25 +end diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index cf0017f4d9d5..e9b96ad3d9a5 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1740,6 +1741,20 @@ err_unlock: return r; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpuid); + if (!dev) + return -EINVAL; + + return kfd_smi_event_open(dev, &args->anon_fd); +} + #define AMDKFD_IOCTL_DEF(ioctl, _func, _flags) \ [_IOC_NR(ioctl)] = {.cmd = ioctl, .func = _func, .flags = _flags, \ .cmd_drv = 0, .name = #ioctl} @@ -1835,6 +1850,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 1009a3b8dcc2..6a250f8fcfb8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -678,6 +678,8 @@ static int kfd_fill_gpu_cache_info(struct kfd_dev *kdev, case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: pcache_info = navi10_cache_info; num_of_cache_types = ARRAY_SIZE(navi10_cache_info); break; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2b4a9b..4bfedaab183f 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -46,6 +46,7 @@ extern const struct kfd2kgd_calls gfx_v8_kfd2kgd; extern const struct kfd2kgd_calls gfx_v9_kfd2kgd; extern const struct kfd2kgd_calls arcturus_kfd2kgd; extern const struct kfd2kgd_calls gfx_v10_kfd2kgd; +extern const struct kfd2kgd_calls gfx_v10_3_kfd2kgd; static const struct kfd2kgd_calls *kfd2kgd_funcs[] = { #ifdef KFD_SUPPORT_IOMMU_V2 @@ -72,6 +73,8 @@ static const struct kfd2kgd_calls *kfd2kgd_funcs[] = { [CHIP_NAVI10] = &gfx_v10_kfd2kgd, [CHIP_NAVI12] = &gfx_v10_kfd2kgd, [CHIP_NAVI14] = &gfx_v10_kfd2kgd, + [CHIP_SIENNA_CICHLID] = &gfx_v10_3_kfd2kgd, + [CHIP_NAVY_FLOUNDER] = &gfx_v10_3_kfd2kgd, }; #ifdef KFD_SUPPORT_IOMMU_V2 @@ -458,6 +461,42 @@ static const struct kfd_device_info navi14_device_info = { .num_sdma_queues_per_engine = 8, }; +static const struct kfd_device_info sienna_cichlid_device_info = { + .asic_family = CHIP_SIENNA_CICHLID, + .asic_name = "sienna_cichlid", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 4, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + +static const struct kfd_device_info navy_flounder_device_info = { + .asic_family = CHIP_NAVY_FLOUNDER, + .asic_name = "navy_flounder", + .max_pasid_bits = 16, + .max_no_of_hqd = 24, + .doorbell_size = 8, + .ih_ring_entry_size = 8 * sizeof(uint32_t), + .event_interrupt_class = &event_interrupt_class_v9, + .num_of_watch_points = 4, + .mqd_size_aligned = MQD_SIZE_ALIGNED, + .needs_iommu_device = false, + .supports_cwsr = true, + .needs_pci_atomics = false, + .num_sdma_engines = 2, + .num_xgmi_sdma_engines = 0, + .num_sdma_queues_per_engine = 8, +}; + /* For each entry, [0] is regular and [1] is virtualisation device. */ static const struct kfd_device_info *kfd_supported_devices[][2] = { #ifdef KFD_SUPPORT_IOMMU_V2 @@ -480,6 +519,8 @@ static const struct kfd_device_info *kfd_supported_devices[][2] = { [CHIP_NAVI10] = {&navi10_device_info, NULL}, [CHIP_NAVI12] = {&navi12_device_info, &navi12_device_info}, [CHIP_NAVI14] = {&navi14_device_info, NULL}, + [CHIP_SIENNA_CICHLID] = {&sienna_cichlid_device_info, &sienna_cichlid_device_info}, + [CHIP_NAVY_FLOUNDER] = {&navy_flounder_device_info, &navy_flounder_device_info}, }; static int kfd_gtt_sa_init(struct kfd_dev *kfd, unsigned int buf_size, @@ -559,6 +600,10 @@ static void kfd_cwsr_init(struct kfd_dev *kfd) BUILD_BUG_ON(sizeof(cwsr_trap_gfx9_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx9_hex; kfd->cwsr_isa_size = sizeof(cwsr_trap_gfx9_hex); + } else if (kfd->device_info->asic_family < CHIP_SIENNA_CICHLID) { + BUILD_BUG_ON(sizeof(cwsr_trap_nv1x_hex) > PAGE_SIZE); + kfd->cwsr_isa = cwsr_trap_nv1x_hex; + kfd->cwsr_isa_size = sizeof(cwsr_trap_nv1x_hex); } else { BUILD_BUG_ON(sizeof(cwsr_trap_gfx10_hex) > PAGE_SIZE); kfd->cwsr_isa = cwsr_trap_gfx10_hex; @@ -577,15 +622,24 @@ static int kfd_gws_init(struct kfd_dev *kfd) return 0; if (hws_gws_support - || (kfd->device_info->asic_family >= CHIP_VEGA10 + || (kfd->device_info->asic_family == CHIP_VEGA10 + && kfd->mec2_fw_version >= 0x81b3) + || (kfd->device_info->asic_family >= CHIP_VEGA12 && kfd->device_info->asic_family <= CHIP_RAVEN - && kfd->mec2_fw_version >= 0x1b3)) + && kfd->mec2_fw_version >= 0x1b3) + || (kfd->device_info->asic_family == CHIP_ARCTURUS + && kfd->mec2_fw_version >= 0x30)) ret = amdgpu_amdkfd_alloc_gws(kfd->kgd, amdgpu_amdkfd_get_num_gws(kfd->kgd), &kfd->gws); return ret; } +static void kfd_smi_init(struct kfd_dev *dev) { + INIT_LIST_HEAD(&dev->smi_clients); + spin_lock_init(&dev->smi_lock); +} + bool kgd2kfd_device_init(struct kfd_dev *kfd, struct drm_device *ddev, const struct kgd2kfd_shared_resources *gpu_resources) @@ -700,6 +754,8 @@ bool kgd2kfd_device_init(struct kfd_dev *kfd, goto kfd_topology_add_device_error; } + kfd_smi_init(kfd); + kfd->init_complete = true; dev_info(kfd_device, "added device %x:%x\n", kfd->pdev->vendor, kfd->pdev->device); @@ -910,6 +966,7 @@ int kgd2kfd_quiesce_mm(struct mm_struct *mm) if (!p) return -ESRCH; + WARN(debug_evictions, "Evicting pid %d", p->lead_thread->pid); r = kfd_process_evict_queues(p); kfd_unref_process(p); @@ -977,6 +1034,8 @@ int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, /* During process initialization eviction_work.dwork is initialized * to kfd_evict_bo_worker */ + WARN(debug_evictions, "Scheduling eviction of pid %d in %ld jiffies", + p->lead_thread->pid, delay_jiffies); schedule_delayed_work(&p->eviction_work, delay_jiffies); out: kfd_unref_process(p); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c index e9c4867abeff..e0e60b0d0669 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c @@ -137,7 +137,7 @@ void program_sh_mem_settings(struct device_queue_manager *dqm, qpd->sh_mem_bases); } -void increment_queue_count(struct device_queue_manager *dqm, +static void increment_queue_count(struct device_queue_manager *dqm, enum kfd_queue_type type) { dqm->active_queue_count++; @@ -145,7 +145,7 @@ void increment_queue_count(struct device_queue_manager *dqm, dqm->active_cp_queue_count++; } -void decrement_queue_count(struct device_queue_manager *dqm, +static void decrement_queue_count(struct device_queue_manager *dqm, enum kfd_queue_type type) { dqm->active_queue_count--; @@ -153,6 +153,30 @@ void decrement_queue_count(struct device_queue_manager *dqm, dqm->active_cp_queue_count--; } +int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val) +{ + int ret; + uint64_t tmp = 0; + + if (!val) + return -EINVAL; + /* + * SDMA activity counter is stored at queue's RPTR + 0x8 location. + */ + if (!access_ok((const void __user *)(q_rptr + + sizeof(uint64_t)), sizeof(uint64_t))) { + pr_err("Can't access sdma queue activity counter\n"); + return -EFAULT; + } + + ret = get_user(tmp, (uint64_t *)(q_rptr + sizeof(uint64_t))); + if (!ret) { + *val = tmp; + } + + return ret; +} + static int allocate_doorbell(struct qcm_process_device *qpd, struct queue *q) { struct kfd_dev *dev = qpd->dqm->dev; @@ -487,6 +511,7 @@ static int destroy_queue_nocpsch_locked(struct device_queue_manager *dqm, if (retval == -ETIME) qpd->reset_wavefronts = true; + mqd_mgr->free_mqd(mqd_mgr, q->mqd, q->mqd_mem_obj); list_del(&q->list); @@ -521,9 +546,23 @@ static int destroy_queue_nocpsch(struct device_queue_manager *dqm, struct queue *q) { int retval; + uint64_t sdma_val = 0; + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + /* Get the SDMA queue stats */ + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || + (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { + retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr, + &sdma_val); + if (retval) + pr_err("Failed to read SDMA queue counter for queue: %d\n", + q->properties.queue_id); + } dqm_lock(dqm); retval = destroy_queue_nocpsch_locked(dqm, qpd, q); + if (!retval) + pdd->sdma_past_activity_counter += sdma_val; dqm_unlock(dqm); return retval; @@ -1428,6 +1467,18 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, { int retval; struct mqd_manager *mqd_mgr; + uint64_t sdma_val = 0; + struct kfd_process_device *pdd = qpd_to_pdd(qpd); + + /* Get the SDMA queue stats */ + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || + (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { + retval = read_sdma_queue_counter((uint64_t)q->properties.read_ptr, + &sdma_val); + if (retval) + pr_err("Failed to read SDMA queue counter for queue: %d\n", + q->properties.queue_id); + } retval = 0; @@ -1449,10 +1500,11 @@ static int destroy_queue_cpsch(struct device_queue_manager *dqm, deallocate_doorbell(qpd, q); - if (q->properties.type == KFD_QUEUE_TYPE_SDMA) - deallocate_sdma_queue(dqm, q); - else if (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI) + if ((q->properties.type == KFD_QUEUE_TYPE_SDMA) || + (q->properties.type == KFD_QUEUE_TYPE_SDMA_XGMI)) { deallocate_sdma_queue(dqm, q); + pdd->sdma_past_activity_counter += sdma_val; + } list_del(&q->list); qpd->queue_count--; @@ -1886,6 +1938,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_dev *dev) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: device_queue_manager_init_v10_navi10(&dqm->asic_ops); break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h index 4afa015c69b1..49d8e324c636 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h @@ -251,4 +251,5 @@ static inline void dqm_unlock(struct device_queue_manager *dqm) mutex_unlock(&dqm->lock_hidden); } +int read_sdma_queue_counter(uint64_t q_rptr, uint64_t *val); #endif /* KFD_DEVICE_QUEUE_MANAGER_H_ */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c index 78714f9a8b11..c1166c40ac15 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_flat_memory.c @@ -415,6 +415,8 @@ int kfd_init_apertures(struct kfd_process *process) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: kfd_init_apertures_v9(pdd, id); break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c index fce6ccabe38b..241bd6ff79f4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "soc15_int.h" #include "kfd_device_queue_manager.h" +#include "kfd_smi_events.h" static bool event_interrupt_isr_v9(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -117,6 +118,7 @@ static void event_interrupt_wq_v9(struct kfd_dev *dev, info.prot_read = ring_id & 0x10; info.prot_write = ring_id & 0x20; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); kfd_signal_vm_fault_event(dev, pasid, &info); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c index d1d68a51bfb8..18e08d82d978 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v10.c @@ -113,7 +113,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | - 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; if (q->format == KFD_QUEUE_FORMAT_AQL) { m->cp_hqd_aql_control = diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c index 48cda3073b70..3b6f5963180d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_v9.c @@ -160,7 +160,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | - 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; if (q->format == KFD_QUEUE_FORMAT_AQL) { m->cp_hqd_aql_control = diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c index a5e8ff1e5945..31799e5f3b3c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_mqd_manager_vi.c @@ -117,7 +117,7 @@ static void init_mqd(struct mqd_manager *mm, void **mqd, m->cp_hqd_quantum = 1 << CP_HQD_QUANTUM__QUANTUM_EN__SHIFT | 1 << CP_HQD_QUANTUM__QUANTUM_SCALE__SHIFT | - 10 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; + 1 << CP_HQD_QUANTUM__QUANTUM_DURATION__SHIFT; set_priority(m, q); m->cp_hqd_eop_rptr = 1 << CP_HQD_EOP_RPTR__INIT_FETCHER__SHIFT; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c index 685ca82d42fe..47ee40fbbd86 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager.c @@ -245,6 +245,8 @@ int pm_init(struct packet_manager *pm, struct device_queue_manager *dqm) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: pm->pmf = &kfd_v9_pm_funcs; break; default: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c index bdca9dc5f118..dfaf771a42e6 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_v9.c @@ -39,7 +39,7 @@ static int pm_map_process_v9(struct packet_manager *pm, packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process)); packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; + packet->bitfields2.process_quantum = 10; packet->bitfields2.pasid = qpd->pqm->process->pasid; packet->bitfields14.gds_size = qpd->gds_size & 0x3F; packet->bitfields14.gds_size_hi = (qpd->gds_size >> 6) & 0xF; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c index bed4d0ccb6b1..a852e0d7d804 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_packet_manager_vi.c @@ -50,7 +50,7 @@ static int pm_map_process_vi(struct packet_manager *pm, uint32_t *buffer, packet->header.u32All = pm_build_pm4_header(IT_MAP_PROCESS, sizeof(struct pm4_mes_map_process)); packet->bitfields2.diq_enable = (qpd->is_debug) ? 1 : 0; - packet->bitfields2.process_quantum = 1; + packet->bitfields2.process_quantum = 10; packet->bitfields2.pasid = qpd->pqm->process->pasid; packet->bitfields3.page_table_base = qpd->page_table_base; packet->bitfields10.gds_size = qpd->gds_size; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c index 33b08ff00b50..2a07c4f2cd0d 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_pasid.c @@ -25,7 +25,7 @@ #include "amdgpu_ids.h" static unsigned int pasid_bits = 16; -static const struct kfd2kgd_calls *kfd2kgd; +static bool pasids_allocated; /* = false */ bool kfd_set_pasid_limit(unsigned int new_limit) { @@ -33,7 +33,7 @@ bool kfd_set_pasid_limit(unsigned int new_limit) return false; if (new_limit < (1U << pasid_bits)) { - if (kfd2kgd) + if (pasids_allocated) /* We've already allocated user PASIDs, too late to * change the limit */ @@ -53,32 +53,17 @@ unsigned int kfd_get_pasid_limit(void) unsigned int kfd_pasid_alloc(void) { - int r; + int r = amdgpu_pasid_alloc(pasid_bits); - /* Find the first best KFD device for calling KGD */ - if (!kfd2kgd) { - struct kfd_dev *dev = NULL; - unsigned int i = 0; - - while ((kfd_topology_enum_kfd_devices(i, &dev)) == 0) { - if (dev && dev->kfd2kgd) { - kfd2kgd = dev->kfd2kgd; - break; - } - i++; - } - - if (!kfd2kgd) - return false; + if (r > 0) { + pasids_allocated = true; + return r; } - r = amdgpu_pasid_alloc(pasid_bits); - - return r > 0 ? r : 0; + return 0; } void kfd_pasid_free(unsigned int pasid) { - if (kfd2kgd) - amdgpu_pasid_free(pasid); + amdgpu_pasid_free(pasid); } diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index fee60921fccf..6727e9de5b8b 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -97,7 +97,7 @@ * Size of the per-process TBA+TMA buffer: 2 pages * * The first page is the TBA used for the CWSR ISA code. The second - * page is used as TMA for daisy changing a user-mode trap handler. + * page is used as TMA for user-mode trap handler setup in daisy-chain mode. */ #define KFD_CWSR_TBA_TMA_SIZE (PAGE_SIZE * 2) #define KFD_CWSR_TMA_OFFSET PAGE_SIZE @@ -157,26 +157,21 @@ extern int debug_largebar; */ extern int ignore_crat; -/* - * Set sh_mem_config.retry_disable on Vega10 - */ +/* Set sh_mem_config.retry_disable on GFX v9 */ extern int amdgpu_noretry; -/* - * Halt if HWS hang is detected - */ +/* Halt if HWS hang is detected */ extern int halt_if_hws_hang; -/* - * Whether MEC FW support GWS barriers - */ +/* Whether MEC FW support GWS barriers */ extern bool hws_gws_support; -/* - * Queue preemption timeout in ms - */ +/* Queue preemption timeout in ms */ extern int queue_preemption_timeout_ms; +/* Enable eviction debug messages */ +extern bool debug_evictions; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent @@ -296,7 +291,7 @@ struct kfd_dev { /* xGMI */ uint64_t hive_id; - + /* UUID */ uint64_t unique_id; @@ -308,8 +303,12 @@ struct kfd_dev { /* Compute Profile ref. count */ atomic_t compute_profile; - /* Global GWS resource shared b/t processes*/ + /* Global GWS resource shared between processes */ void *gws; + + /* Clients watching SMI events */ + struct list_head smi_clients; + spinlock_t smi_lock; }; enum kfd_mempool { @@ -324,7 +323,7 @@ void kfd_chardev_exit(void); struct device *kfd_chardev(void); /** - * enum kfd_unmap_queues_filter + * enum kfd_unmap_queues_filter - Enum for queue filters. * * @KFD_UNMAP_QUEUES_FILTER_SINGLE_QUEUE: Preempts single queue. * @@ -343,15 +342,17 @@ enum kfd_unmap_queues_filter { }; /** - * enum kfd_queue_type + * enum kfd_queue_type - Enum for various queue types. * * @KFD_QUEUE_TYPE_COMPUTE: Regular user mode queue type. * - * @KFD_QUEUE_TYPE_SDMA: Sdma user mode queue type. + * @KFD_QUEUE_TYPE_SDMA: SDMA user mode queue type. * * @KFD_QUEUE_TYPE_HIQ: HIQ queue type. * * @KFD_QUEUE_TYPE_DIQ: DIQ queue type. + * + * @KFD_QUEUE_TYPE_SDMA_XGMI: Special SDMA queue for XGMI interface. */ enum kfd_queue_type { KFD_QUEUE_TYPE_COMPUTE, @@ -397,9 +398,9 @@ enum KFD_QUEUE_PRIORITY { * * @write_ptr: Defines the number of dwords written to the ring buffer. * - * @doorbell_ptr: This field aim is to notify the H/W of new packet written to - * the queue ring buffer. This field should be similar to write_ptr and the - * user should update this field after he updated the write_ptr. + * @doorbell_ptr: Notifies the H/W of new packet written to the queue ring + * buffer. This field should be similar to write_ptr and the user should + * update this field after updating the write_ptr. * * @doorbell_off: The doorbell offset in the doorbell pci-bar. * @@ -468,7 +469,7 @@ struct queue_properties { * * @list: Queue linked list. * - * @mqd: The queue MQD. + * @mqd: The queue MQD (memory queue descriptor). * * @mqd_mem_obj: The MQD local gpu memory object. * @@ -477,7 +478,7 @@ struct queue_properties { * @properties: The queue properties. * * @mec: Used only in no cp scheduling mode and identifies to micro engine id - * that the queue should be execute on. + * that the queue should be executed on. * * @pipe: Used only in no cp scheduling mode and identifies the queue's pipe * id. @@ -518,9 +519,6 @@ struct queue { struct kobject kobj; }; -/* - * Please read the kfd_mqd_manager.h description. - */ enum KFD_MQD_TYPE { KFD_MQD_TYPE_HIQ = 0, /* for hiq */ KFD_MQD_TYPE_CP, /* for cp queues and diq */ @@ -578,9 +576,7 @@ struct qcm_process_device { */ bool mapped_gws_queue; - /* - * All the memory management data should be here too - */ + /* All the memory management data should be here too */ uint64_t gds_context_area; /* Contains page table flags such as AMDGPU_PTE_VALID since gfx9 */ uint64_t page_table_base; @@ -630,7 +626,14 @@ enum kfd_pdd_bound { PDD_BOUND_SUSPENDED, }; -#define MAX_VRAM_FILENAME_LEN 11 +#define MAX_SYSFS_FILENAME_LEN 11 + +/* + * SDMA counter runs at 100MHz frequency. + * We display SDMA activity in microsecond granularity in sysfs. + * As a result, the divisor is 100. + */ +#define SDMA_ACTIVITY_DIVISOR 100 /* Data that is per-process-per device. */ struct kfd_process_device { @@ -678,7 +681,12 @@ struct kfd_process_device { /* VRAM usage */ uint64_t vram_usage; struct attribute attr_vram; - char vram_filename[MAX_VRAM_FILENAME_LEN]; + char vram_filename[MAX_SYSFS_FILENAME_LEN]; + + /* SDMA activity tracking */ + uint64_t sdma_past_activity_counter; + struct attribute attr_sdma; + char sdma_filename[MAX_SYSFS_FILENAME_LEN]; }; #define qpd_to_pdd(x) container_of(x, struct kfd_process_device, qpd) @@ -768,11 +776,13 @@ extern DECLARE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); extern struct srcu_struct kfd_processes_srcu; /** - * Ioctl function type. + * typedef amdkfd_ioctl_t - typedef for ioctl function pointer. + * + * @filep: pointer to file structure. + * @p: amdkfd process pointer. + * @data: pointer to arg that was copied from user. * - * \param filep pointer to file structure. - * \param p amdkfd process pointer. - * \param data pointer to arg that was copied from user. + * Return: returns ioctl completion code. */ typedef int amdkfd_ioctl_t(struct file *filep, struct kfd_process *p, void *data); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index 0e0c42e9f6a3..40695d52e9a8 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -25,6 +25,7 @@ #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> +#include <linux/mmu_context.h> #include <linux/slab.h> #include <linux/amd-iommu.h> #include <linux/notifier.h> @@ -76,6 +77,178 @@ struct kfd_procfs_tree { static struct kfd_procfs_tree procfs; +/* + * Structure for SDMA activity tracking + */ +struct kfd_sdma_activity_handler_workarea { + struct work_struct sdma_activity_work; + struct kfd_process_device *pdd; + uint64_t sdma_activity_counter; +}; + +struct temp_sdma_queue_list { + uint64_t rptr; + uint64_t sdma_val; + unsigned int queue_id; + struct list_head list; +}; + +static void kfd_sdma_activity_worker(struct work_struct *work) +{ + struct kfd_sdma_activity_handler_workarea *workarea; + struct kfd_process_device *pdd; + uint64_t val; + struct mm_struct *mm; + struct queue *q; + struct qcm_process_device *qpd; + struct device_queue_manager *dqm; + int ret = 0; + struct temp_sdma_queue_list sdma_q_list; + struct temp_sdma_queue_list *sdma_q, *next; + + workarea = container_of(work, struct kfd_sdma_activity_handler_workarea, + sdma_activity_work); + if (!workarea) + return; + + pdd = workarea->pdd; + if (!pdd) + return; + dqm = pdd->dev->dqm; + qpd = &pdd->qpd; + if (!dqm || !qpd) + return; + /* + * Total SDMA activity is current SDMA activity + past SDMA activity + * Past SDMA count is stored in pdd. + * To get the current activity counters for all active SDMA queues, + * we loop over all SDMA queues and get their counts from user-space. + * + * We cannot call get_user() with dqm_lock held as it can cause + * a circular lock dependency situation. To read the SDMA stats, + * we need to do the following: + * + * 1. Create a temporary list of SDMA queue nodes from the qpd->queues_list, + * with dqm_lock/dqm_unlock(). + * 2. Call get_user() for each node in temporary list without dqm_lock. + * Save the SDMA count for each node and also add the count to the total + * SDMA count counter. + * Its possible, during this step, a few SDMA queue nodes got deleted + * from the qpd->queues_list. + * 3. Do a second pass over qpd->queues_list to check if any nodes got deleted. + * If any node got deleted, its SDMA count would be captured in the sdma + * past activity counter. So subtract the SDMA counter stored in step 2 + * for this node from the total SDMA count. + */ + INIT_LIST_HEAD(&sdma_q_list.list); + + /* + * Create the temp list of all SDMA queues + */ + dqm_lock(dqm); + + list_for_each_entry(q, &qpd->queues_list, list) { + if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) && + (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) + continue; + + sdma_q = kzalloc(sizeof(struct temp_sdma_queue_list), GFP_KERNEL); + if (!sdma_q) { + dqm_unlock(dqm); + goto cleanup; + } + + INIT_LIST_HEAD(&sdma_q->list); + sdma_q->rptr = (uint64_t)q->properties.read_ptr; + sdma_q->queue_id = q->properties.queue_id; + list_add_tail(&sdma_q->list, &sdma_q_list.list); + } + + /* + * If the temp list is empty, then no SDMA queues nodes were found in + * qpd->queues_list. Return the past activity count as the total sdma + * count + */ + if (list_empty(&sdma_q_list.list)) { + workarea->sdma_activity_counter = pdd->sdma_past_activity_counter; + dqm_unlock(dqm); + return; + } + + dqm_unlock(dqm); + + /* + * Get the usage count for each SDMA queue in temp_list. + */ + mm = get_task_mm(pdd->process->lead_thread); + if (!mm) + goto cleanup; + + kthread_use_mm(mm); + + list_for_each_entry(sdma_q, &sdma_q_list.list, list) { + val = 0; + ret = read_sdma_queue_counter(sdma_q->rptr, &val); + if (ret) { + pr_debug("Failed to read SDMA queue active counter for queue id: %d", + sdma_q->queue_id); + } else { + sdma_q->sdma_val = val; + workarea->sdma_activity_counter += val; + } + } + + kthread_unuse_mm(mm); + mmput(mm); + + /* + * Do a second iteration over qpd_queues_list to check if any SDMA + * nodes got deleted while fetching SDMA counter. + */ + dqm_lock(dqm); + + workarea->sdma_activity_counter += pdd->sdma_past_activity_counter; + + list_for_each_entry(q, &qpd->queues_list, list) { + if (list_empty(&sdma_q_list.list)) + break; + + if ((q->properties.type != KFD_QUEUE_TYPE_SDMA) && + (q->properties.type != KFD_QUEUE_TYPE_SDMA_XGMI)) + continue; + + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + if (((uint64_t)q->properties.read_ptr == sdma_q->rptr) && + (sdma_q->queue_id == q->properties.queue_id)) { + list_del(&sdma_q->list); + kfree(sdma_q); + break; + } + } + } + + dqm_unlock(dqm); + + /* + * If temp list is not empty, it implies some queues got deleted + * from qpd->queues_list during SDMA usage read. Subtract the SDMA + * count for each node from the total SDMA count. + */ + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + workarea->sdma_activity_counter -= sdma_q->sdma_val; + list_del(&sdma_q->list); + kfree(sdma_q); + } + + return; + +cleanup: + list_for_each_entry_safe(sdma_q, next, &sdma_q_list.list, list) { + list_del(&sdma_q->list); + kfree(sdma_q); + } +} + static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, char *buffer) { @@ -87,8 +260,24 @@ static ssize_t kfd_procfs_show(struct kobject *kobj, struct attribute *attr, } else if (strncmp(attr->name, "vram_", 5) == 0) { struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, attr_vram); - if (pdd) - return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + return snprintf(buffer, PAGE_SIZE, "%llu\n", READ_ONCE(pdd->vram_usage)); + } else if (strncmp(attr->name, "sdma_", 5) == 0) { + struct kfd_process_device *pdd = container_of(attr, struct kfd_process_device, + attr_sdma); + struct kfd_sdma_activity_handler_workarea sdma_activity_work_handler; + + INIT_WORK(&sdma_activity_work_handler.sdma_activity_work, + kfd_sdma_activity_worker); + + sdma_activity_work_handler.pdd = pdd; + + schedule_work(&sdma_activity_work_handler.sdma_activity_work); + + flush_work(&sdma_activity_work_handler.sdma_activity_work); + + return snprintf(buffer, PAGE_SIZE, "%llu\n", + (sdma_activity_work_handler.sdma_activity_counter)/ + SDMA_ACTIVITY_DIVISOR); } else { pr_err("Invalid attribute"); return -EINVAL; @@ -210,7 +399,24 @@ int kfd_procfs_add_queue(struct queue *q) return 0; } -int kfd_procfs_add_vram_usage(struct kfd_process *p) +static int kfd_sysfs_create_file(struct kfd_process *p, struct attribute *attr, + char *name) +{ + int ret = 0; + + if (!p || !attr || !name) + return -EINVAL; + + attr->name = name; + attr->mode = KFD_SYSFS_FILE_MODE; + sysfs_attr_init(attr); + + ret = sysfs_create_file(p->kobj, attr); + + return ret; +} + +static int kfd_procfs_add_sysfs_files(struct kfd_process *p) { int ret = 0; struct kfd_process_device *pdd; @@ -221,17 +427,25 @@ int kfd_procfs_add_vram_usage(struct kfd_process *p) if (!p->kobj) return -EFAULT; - /* Create proc/<pid>/vram_<gpuid> file for each GPU */ + /* + * Create sysfs files for each GPU: + * - proc/<pid>/vram_<gpuid> + * - proc/<pid>/sdma_<gpuid> + */ list_for_each_entry(pdd, &p->per_device_data, per_device_list) { - snprintf(pdd->vram_filename, MAX_VRAM_FILENAME_LEN, "vram_%u", + snprintf(pdd->vram_filename, MAX_SYSFS_FILENAME_LEN, "vram_%u", pdd->dev->id); - pdd->attr_vram.name = pdd->vram_filename; - pdd->attr_vram.mode = KFD_SYSFS_FILE_MODE; - sysfs_attr_init(&pdd->attr_vram); - ret = sysfs_create_file(p->kobj, &pdd->attr_vram); + ret = kfd_sysfs_create_file(p, &pdd->attr_vram, pdd->vram_filename); if (ret) pr_warn("Creating vram usage for gpu id %d failed", (int)pdd->dev->id); + + snprintf(pdd->sdma_filename, MAX_SYSFS_FILENAME_LEN, "sdma_%u", + pdd->dev->id); + ret = kfd_sysfs_create_file(p, &pdd->attr_sdma, pdd->sdma_filename); + if (ret) + pr_warn("Creating sdma usage for gpu id %d failed", + (int)pdd->dev->id); } return ret; @@ -445,9 +659,9 @@ struct kfd_process *kfd_create_process(struct file *filep) if (!process->kobj_queues) pr_warn("Creating KFD proc/queues folder failed"); - ret = kfd_procfs_add_vram_usage(process); + ret = kfd_procfs_add_sysfs_files(process); if (ret) - pr_warn("Creating vram usage file for pid %d failed", + pr_warn("Creating sysfs usage file for pid %d failed", (int)process->lead_thread->pid); } out: @@ -598,8 +812,10 @@ static void kfd_process_wq_release(struct work_struct *work) kobject_put(p->kobj_queues); p->kobj_queues = NULL; - list_for_each_entry(pdd, &p->per_device_data, per_device_list) + list_for_each_entry(pdd, &p->per_device_data, per_device_list) { sysfs_remove_file(p->kobj, &pdd->attr_vram); + sysfs_remove_file(p->kobj, &pdd->attr_sdma); + } kobject_del(p->kobj); kobject_put(p->kobj); @@ -907,6 +1123,7 @@ struct kfd_process_device *kfd_create_process_device_data(struct kfd_dev *dev, pdd->already_dequeued = false; pdd->runtime_inuse = false; pdd->vram_usage = 0; + pdd->sdma_past_activity_counter = 0; list_add(&pdd->per_device_list, &p->per_device_data); /* Init idr used for memory handle translation */ @@ -1003,8 +1220,10 @@ struct kfd_process_device *kfd_bind_process_to_device(struct kfd_dev *dev, */ if (!pdd->runtime_inuse) { err = pm_runtime_get_sync(dev->ddev->dev); - if (err < 0) + if (err < 0) { + pm_runtime_put_autosuspend(dev->ddev->dev); return ERR_PTR(err); + } } err = kfd_iommu_bind_process_to_device(pdd); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c new file mode 100644 index 000000000000..7b348bf9df21 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c @@ -0,0 +1,227 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <linux/poll.h> +#include <linux/wait.h> +#include <linux/anon_inodes.h> +#include <uapi/linux/kfd_ioctl.h> +#include "amdgpu_vm.h" +#include "kfd_priv.h" +#include "kfd_smi_events.h" + +struct kfd_smi_client { + struct list_head list; + struct kfifo fifo; + wait_queue_head_t wait_queue; + /* events enabled */ + uint64_t events; + struct kfd_dev *dev; + spinlock_t lock; +}; + +#define MAX_KFIFO_SIZE 1024 + +static __poll_t kfd_smi_ev_poll(struct file *, struct poll_table_struct *); +static ssize_t kfd_smi_ev_read(struct file *, char __user *, size_t, loff_t *); +static ssize_t kfd_smi_ev_write(struct file *, const char __user *, size_t, + loff_t *); +static int kfd_smi_ev_release(struct inode *, struct file *); + +static const char kfd_smi_name[] = "kfd_smi_ev"; + +static const struct file_operations kfd_smi_ev_fops = { + .owner = THIS_MODULE, + .poll = kfd_smi_ev_poll, + .read = kfd_smi_ev_read, + .write = kfd_smi_ev_write, + .release = kfd_smi_ev_release +}; + +static __poll_t kfd_smi_ev_poll(struct file *filep, + struct poll_table_struct *wait) +{ + struct kfd_smi_client *client = filep->private_data; + __poll_t mask = 0; + + poll_wait(filep, &client->wait_queue, wait); + + spin_lock(&client->lock); + if (!kfifo_is_empty(&client->fifo)) + mask = EPOLLIN | EPOLLRDNORM; + spin_unlock(&client->lock); + + return mask; +} + +static ssize_t kfd_smi_ev_read(struct file *filep, char __user *user, + size_t size, loff_t *offset) +{ + int ret; + size_t to_copy; + struct kfd_smi_client *client = filep->private_data; + unsigned char *buf; + + buf = kmalloc(MAX_KFIFO_SIZE * sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + /* kfifo_to_user can sleep so we can't use spinlock protection around + * it. Instead, we kfifo out as spinlocked then copy them to the user. + */ + spin_lock(&client->lock); + to_copy = kfifo_len(&client->fifo); + if (!to_copy) { + spin_unlock(&client->lock); + ret = -EAGAIN; + goto ret_err; + } + to_copy = min3(size, sizeof(buf), to_copy); + ret = kfifo_out(&client->fifo, buf, to_copy); + spin_unlock(&client->lock); + if (ret <= 0) { + ret = -EAGAIN; + goto ret_err; + } + + ret = copy_to_user(user, buf, to_copy); + if (ret) { + ret = -EFAULT; + goto ret_err; + } + + kfree(buf); + return to_copy; + +ret_err: + kfree(buf); + return ret; +} + +static ssize_t kfd_smi_ev_write(struct file *filep, const char __user *user, + size_t size, loff_t *offset) +{ + struct kfd_smi_client *client = filep->private_data; + uint64_t events; + + if (!access_ok(user, size) || size < sizeof(events)) + return -EFAULT; + if (copy_from_user(&events, user, sizeof(events))) + return -EFAULT; + + WRITE_ONCE(client->events, events); + + return sizeof(events); +} + +static int kfd_smi_ev_release(struct inode *inode, struct file *filep) +{ + struct kfd_smi_client *client = filep->private_data; + struct kfd_dev *dev = client->dev; + + spin_lock(&dev->smi_lock); + list_del_rcu(&client->list); + spin_unlock(&dev->smi_lock); + + synchronize_rcu(); + kfifo_free(&client->fifo); + kfree(client); + + return 0; +} + +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; + struct amdgpu_task_info task_info; + /* VmFault msg = (hex)uint32_pid(8) + :(1) + task name(16) = 25 */ + /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 + */ + char fifo_in[43]; + struct kfd_smi_client *client; + int len; + + if (list_empty(&dev->smi_clients)) + return; + + memset(&task_info, 0, sizeof(struct amdgpu_task_info)); + amdgpu_vm_get_task_info(adev, pasid, &task_info); + /* Report VM faults from user applications, not retry from kernel */ + if (!task_info.pid) + return; + + len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, + task_info.pid, task_info.task_name); + + rcu_read_lock(); + + list_for_each_entry_rcu(client, &dev->smi_clients, list) { + if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT)) + continue; + spin_lock(&client->lock); + if (kfifo_avail(&client->fifo) >= len) { + kfifo_in(&client->fifo, fifo_in, len); + wake_up_all(&client->wait_queue); + } + else + pr_debug("smi_event(vmfault): no space left\n"); + spin_unlock(&client->lock); + } + + rcu_read_unlock(); +} + +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) +{ + struct kfd_smi_client *client; + int ret; + + client = kzalloc(sizeof(struct kfd_smi_client), GFP_KERNEL); + if (!client) + return -ENOMEM; + INIT_LIST_HEAD(&client->list); + + ret = kfifo_alloc(&client->fifo, MAX_KFIFO_SIZE, GFP_KERNEL); + if (ret) { + kfree(client); + return ret; + } + + ret = anon_inode_getfd(kfd_smi_name, &kfd_smi_ev_fops, (void *)client, + O_RDWR); + if (ret < 0) { + kfifo_free(&client->fifo); + kfree(client); + return ret; + } + *fd = ret; + + init_waitqueue_head(&client->wait_queue); + spin_lock_init(&client->lock); + client->events = 0; + client->dev = dev; + + spin_lock(&dev->smi_lock); + list_add_rcu(&client->list, &dev->smi_clients); + spin_unlock(&dev->smi_lock); + + return 0; +} diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h new file mode 100644 index 000000000000..a9cb218fef96 --- /dev/null +++ b/drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h @@ -0,0 +1,29 @@ +/* + * Copyright 2020 Advanced Micro Devices, Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef KFD_SMI_EVENTS_H_INCLUDED +#define KFD_SMI_EVENTS_H_INCLUDED + +int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); +void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); + +#endif diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c index bb77f7af2b6d..f185f6cbc05c 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_topology.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_topology.c @@ -210,39 +210,41 @@ struct kfd_topology_device *kfd_create_topology_device( } -#define sysfs_show_gen_prop(buffer, fmt, ...) \ - snprintf(buffer, PAGE_SIZE, "%s"fmt, buffer, __VA_ARGS__) -#define sysfs_show_32bit_prop(buffer, name, value) \ - sysfs_show_gen_prop(buffer, "%s %u\n", name, value) -#define sysfs_show_64bit_prop(buffer, name, value) \ - sysfs_show_gen_prop(buffer, "%s %llu\n", name, value) -#define sysfs_show_32bit_val(buffer, value) \ - sysfs_show_gen_prop(buffer, "%u\n", value) -#define sysfs_show_str_val(buffer, value) \ - sysfs_show_gen_prop(buffer, "%s\n", value) +#define sysfs_show_gen_prop(buffer, offs, fmt, ...) \ + (offs += snprintf(buffer+offs, PAGE_SIZE-offs, \ + fmt, __VA_ARGS__)) +#define sysfs_show_32bit_prop(buffer, offs, name, value) \ + sysfs_show_gen_prop(buffer, offs, "%s %u\n", name, value) +#define sysfs_show_64bit_prop(buffer, offs, name, value) \ + sysfs_show_gen_prop(buffer, offs, "%s %llu\n", name, value) +#define sysfs_show_32bit_val(buffer, offs, value) \ + sysfs_show_gen_prop(buffer, offs, "%u\n", value) +#define sysfs_show_str_val(buffer, offs, value) \ + sysfs_show_gen_prop(buffer, offs, "%s\n", value) static ssize_t sysprops_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; /* Making sure that the buffer is an empty string */ buffer[0] = 0; if (attr == &sys_props.attr_genid) { - ret = sysfs_show_32bit_val(buffer, sys_props.generation_count); + sysfs_show_32bit_val(buffer, offs, + sys_props.generation_count); } else if (attr == &sys_props.attr_props) { - sysfs_show_64bit_prop(buffer, "platform_oem", - sys_props.platform_oem); - sysfs_show_64bit_prop(buffer, "platform_id", - sys_props.platform_id); - ret = sysfs_show_64bit_prop(buffer, "platform_rev", - sys_props.platform_rev); + sysfs_show_64bit_prop(buffer, offs, "platform_oem", + sys_props.platform_oem); + sysfs_show_64bit_prop(buffer, offs, "platform_id", + sys_props.platform_id); + sysfs_show_64bit_prop(buffer, offs, "platform_rev", + sys_props.platform_rev); } else { - ret = -EINVAL; + offs = -EINVAL; } - return ret; + return offs; } static void kfd_topology_kobj_release(struct kobject *kobj) @@ -262,7 +264,7 @@ static struct kobj_type sysprops_type = { static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; struct kfd_iolink_properties *iolink; /* Making sure that the buffer is an empty string */ @@ -271,21 +273,23 @@ static ssize_t iolink_show(struct kobject *kobj, struct attribute *attr, iolink = container_of(attr, struct kfd_iolink_properties, attr); if (iolink->gpu && kfd_devcgroup_check_permission(iolink->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "type", iolink->iolink_type); - sysfs_show_32bit_prop(buffer, "version_major", iolink->ver_maj); - sysfs_show_32bit_prop(buffer, "version_minor", iolink->ver_min); - sysfs_show_32bit_prop(buffer, "node_from", iolink->node_from); - sysfs_show_32bit_prop(buffer, "node_to", iolink->node_to); - sysfs_show_32bit_prop(buffer, "weight", iolink->weight); - sysfs_show_32bit_prop(buffer, "min_latency", iolink->min_latency); - sysfs_show_32bit_prop(buffer, "max_latency", iolink->max_latency); - sysfs_show_32bit_prop(buffer, "min_bandwidth", iolink->min_bandwidth); - sysfs_show_32bit_prop(buffer, "max_bandwidth", iolink->max_bandwidth); - sysfs_show_32bit_prop(buffer, "recommended_transfer_size", - iolink->rec_transfer_size); - ret = sysfs_show_32bit_prop(buffer, "flags", iolink->flags); - - return ret; + sysfs_show_32bit_prop(buffer, offs, "type", iolink->iolink_type); + sysfs_show_32bit_prop(buffer, offs, "version_major", iolink->ver_maj); + sysfs_show_32bit_prop(buffer, offs, "version_minor", iolink->ver_min); + sysfs_show_32bit_prop(buffer, offs, "node_from", iolink->node_from); + sysfs_show_32bit_prop(buffer, offs, "node_to", iolink->node_to); + sysfs_show_32bit_prop(buffer, offs, "weight", iolink->weight); + sysfs_show_32bit_prop(buffer, offs, "min_latency", iolink->min_latency); + sysfs_show_32bit_prop(buffer, offs, "max_latency", iolink->max_latency); + sysfs_show_32bit_prop(buffer, offs, "min_bandwidth", + iolink->min_bandwidth); + sysfs_show_32bit_prop(buffer, offs, "max_bandwidth", + iolink->max_bandwidth); + sysfs_show_32bit_prop(buffer, offs, "recommended_transfer_size", + iolink->rec_transfer_size); + sysfs_show_32bit_prop(buffer, offs, "flags", iolink->flags); + + return offs; } static const struct sysfs_ops iolink_ops = { @@ -300,7 +304,7 @@ static struct kobj_type iolink_type = { static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; struct kfd_mem_properties *mem; /* Making sure that the buffer is an empty string */ @@ -309,13 +313,15 @@ static ssize_t mem_show(struct kobject *kobj, struct attribute *attr, mem = container_of(attr, struct kfd_mem_properties, attr); if (mem->gpu && kfd_devcgroup_check_permission(mem->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "heap_type", mem->heap_type); - sysfs_show_64bit_prop(buffer, "size_in_bytes", mem->size_in_bytes); - sysfs_show_32bit_prop(buffer, "flags", mem->flags); - sysfs_show_32bit_prop(buffer, "width", mem->width); - ret = sysfs_show_32bit_prop(buffer, "mem_clk_max", mem->mem_clk_max); - - return ret; + sysfs_show_32bit_prop(buffer, offs, "heap_type", mem->heap_type); + sysfs_show_64bit_prop(buffer, offs, "size_in_bytes", + mem->size_in_bytes); + sysfs_show_32bit_prop(buffer, offs, "flags", mem->flags); + sysfs_show_32bit_prop(buffer, offs, "width", mem->width); + sysfs_show_32bit_prop(buffer, offs, "mem_clk_max", + mem->mem_clk_max); + + return offs; } static const struct sysfs_ops mem_ops = { @@ -330,7 +336,7 @@ static struct kobj_type mem_type = { static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, char *buffer) { - ssize_t ret; + int offs = 0; uint32_t i, j; struct kfd_cache_properties *cache; @@ -340,30 +346,27 @@ static ssize_t kfd_cache_show(struct kobject *kobj, struct attribute *attr, cache = container_of(attr, struct kfd_cache_properties, attr); if (cache->gpu && kfd_devcgroup_check_permission(cache->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "processor_id_low", + sysfs_show_32bit_prop(buffer, offs, "processor_id_low", cache->processor_id_low); - sysfs_show_32bit_prop(buffer, "level", cache->cache_level); - sysfs_show_32bit_prop(buffer, "size", cache->cache_size); - sysfs_show_32bit_prop(buffer, "cache_line_size", cache->cacheline_size); - sysfs_show_32bit_prop(buffer, "cache_lines_per_tag", - cache->cachelines_per_tag); - sysfs_show_32bit_prop(buffer, "association", cache->cache_assoc); - sysfs_show_32bit_prop(buffer, "latency", cache->cache_latency); - sysfs_show_32bit_prop(buffer, "type", cache->cache_type); - snprintf(buffer, PAGE_SIZE, "%ssibling_map ", buffer); + sysfs_show_32bit_prop(buffer, offs, "level", cache->cache_level); + sysfs_show_32bit_prop(buffer, offs, "size", cache->cache_size); + sysfs_show_32bit_prop(buffer, offs, "cache_line_size", + cache->cacheline_size); + sysfs_show_32bit_prop(buffer, offs, "cache_lines_per_tag", + cache->cachelines_per_tag); + sysfs_show_32bit_prop(buffer, offs, "association", cache->cache_assoc); + sysfs_show_32bit_prop(buffer, offs, "latency", cache->cache_latency); + sysfs_show_32bit_prop(buffer, offs, "type", cache->cache_type); + offs += snprintf(buffer+offs, PAGE_SIZE-offs, "sibling_map "); for (i = 0; i < CRAT_SIBLINGMAP_SIZE; i++) - for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) { + for (j = 0; j < sizeof(cache->sibling_map[0])*8; j++) /* Check each bit */ - if (cache->sibling_map[i] & (1 << j)) - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 1, ","); - else - ret = snprintf(buffer, PAGE_SIZE, - "%s%d%s", buffer, 0, ","); - } + offs += snprintf(buffer+offs, PAGE_SIZE-offs, "%d,", + (cache->sibling_map[i] >> j) & 1); + /* Replace the last "," with end of line */ - *(buffer + strlen(buffer) - 1) = 0xA; - return ret; + buffer[offs-1] = '\n'; + return offs; } static const struct sysfs_ops cache_ops = { @@ -385,6 +388,7 @@ struct kfd_perf_attr { static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, char *buf) { + int offs = 0; struct kfd_perf_attr *attr; buf[0] = 0; @@ -392,7 +396,7 @@ static ssize_t perf_show(struct kobject *kobj, struct kobj_attribute *attrs, if (!attr->data) /* invalid data for PMC */ return 0; else - return sysfs_show_32bit_val(buf, attr->data); + return sysfs_show_32bit_val(buf, offs, attr->data); } #define KFD_PERF_DESC(_name, _data) \ @@ -411,6 +415,7 @@ static struct kfd_perf_attr perf_attr_iommu[] = { static ssize_t node_show(struct kobject *kobj, struct attribute *attr, char *buffer) { + int offs = 0; struct kfd_topology_device *dev; uint32_t log_max_watch_addr; @@ -422,7 +427,7 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, attr_gpuid); if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) return -EPERM; - return sysfs_show_32bit_val(buffer, dev->gpu_id); + return sysfs_show_32bit_val(buffer, offs, dev->gpu_id); } if (strcmp(attr->name, "name") == 0) { @@ -431,69 +436,69 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) return -EPERM; - return sysfs_show_str_val(buffer, dev->node_props.name); + return sysfs_show_str_val(buffer, offs, dev->node_props.name); } dev = container_of(attr, struct kfd_topology_device, attr_props); if (dev->gpu && kfd_devcgroup_check_permission(dev->gpu)) return -EPERM; - sysfs_show_32bit_prop(buffer, "cpu_cores_count", - dev->node_props.cpu_cores_count); - sysfs_show_32bit_prop(buffer, "simd_count", - dev->node_props.simd_count); - sysfs_show_32bit_prop(buffer, "mem_banks_count", - dev->node_props.mem_banks_count); - sysfs_show_32bit_prop(buffer, "caches_count", - dev->node_props.caches_count); - sysfs_show_32bit_prop(buffer, "io_links_count", - dev->node_props.io_links_count); - sysfs_show_32bit_prop(buffer, "cpu_core_id_base", - dev->node_props.cpu_core_id_base); - sysfs_show_32bit_prop(buffer, "simd_id_base", - dev->node_props.simd_id_base); - sysfs_show_32bit_prop(buffer, "max_waves_per_simd", - dev->node_props.max_waves_per_simd); - sysfs_show_32bit_prop(buffer, "lds_size_in_kb", - dev->node_props.lds_size_in_kb); - sysfs_show_32bit_prop(buffer, "gds_size_in_kb", - dev->node_props.gds_size_in_kb); - sysfs_show_32bit_prop(buffer, "num_gws", - dev->node_props.num_gws); - sysfs_show_32bit_prop(buffer, "wave_front_size", - dev->node_props.wave_front_size); - sysfs_show_32bit_prop(buffer, "array_count", - dev->node_props.array_count); - sysfs_show_32bit_prop(buffer, "simd_arrays_per_engine", - dev->node_props.simd_arrays_per_engine); - sysfs_show_32bit_prop(buffer, "cu_per_simd_array", - dev->node_props.cu_per_simd_array); - sysfs_show_32bit_prop(buffer, "simd_per_cu", - dev->node_props.simd_per_cu); - sysfs_show_32bit_prop(buffer, "max_slots_scratch_cu", - dev->node_props.max_slots_scratch_cu); - sysfs_show_32bit_prop(buffer, "vendor_id", - dev->node_props.vendor_id); - sysfs_show_32bit_prop(buffer, "device_id", - dev->node_props.device_id); - sysfs_show_32bit_prop(buffer, "location_id", - dev->node_props.location_id); - sysfs_show_32bit_prop(buffer, "domain", - dev->node_props.domain); - sysfs_show_32bit_prop(buffer, "drm_render_minor", - dev->node_props.drm_render_minor); - sysfs_show_64bit_prop(buffer, "hive_id", - dev->node_props.hive_id); - sysfs_show_32bit_prop(buffer, "num_sdma_engines", - dev->node_props.num_sdma_engines); - sysfs_show_32bit_prop(buffer, "num_sdma_xgmi_engines", - dev->node_props.num_sdma_xgmi_engines); - sysfs_show_32bit_prop(buffer, "num_sdma_queues_per_engine", - dev->node_props.num_sdma_queues_per_engine); - sysfs_show_32bit_prop(buffer, "num_cp_queues", - dev->node_props.num_cp_queues); - sysfs_show_64bit_prop(buffer, "unique_id", - dev->node_props.unique_id); + sysfs_show_32bit_prop(buffer, offs, "cpu_cores_count", + dev->node_props.cpu_cores_count); + sysfs_show_32bit_prop(buffer, offs, "simd_count", + dev->node_props.simd_count); + sysfs_show_32bit_prop(buffer, offs, "mem_banks_count", + dev->node_props.mem_banks_count); + sysfs_show_32bit_prop(buffer, offs, "caches_count", + dev->node_props.caches_count); + sysfs_show_32bit_prop(buffer, offs, "io_links_count", + dev->node_props.io_links_count); + sysfs_show_32bit_prop(buffer, offs, "cpu_core_id_base", + dev->node_props.cpu_core_id_base); + sysfs_show_32bit_prop(buffer, offs, "simd_id_base", + dev->node_props.simd_id_base); + sysfs_show_32bit_prop(buffer, offs, "max_waves_per_simd", + dev->node_props.max_waves_per_simd); + sysfs_show_32bit_prop(buffer, offs, "lds_size_in_kb", + dev->node_props.lds_size_in_kb); + sysfs_show_32bit_prop(buffer, offs, "gds_size_in_kb", + dev->node_props.gds_size_in_kb); + sysfs_show_32bit_prop(buffer, offs, "num_gws", + dev->node_props.num_gws); + sysfs_show_32bit_prop(buffer, offs, "wave_front_size", + dev->node_props.wave_front_size); + sysfs_show_32bit_prop(buffer, offs, "array_count", + dev->node_props.array_count); + sysfs_show_32bit_prop(buffer, offs, "simd_arrays_per_engine", + dev->node_props.simd_arrays_per_engine); + sysfs_show_32bit_prop(buffer, offs, "cu_per_simd_array", + dev->node_props.cu_per_simd_array); + sysfs_show_32bit_prop(buffer, offs, "simd_per_cu", + dev->node_props.simd_per_cu); + sysfs_show_32bit_prop(buffer, offs, "max_slots_scratch_cu", + dev->node_props.max_slots_scratch_cu); + sysfs_show_32bit_prop(buffer, offs, "vendor_id", + dev->node_props.vendor_id); + sysfs_show_32bit_prop(buffer, offs, "device_id", + dev->node_props.device_id); + sysfs_show_32bit_prop(buffer, offs, "location_id", + dev->node_props.location_id); + sysfs_show_32bit_prop(buffer, offs, "domain", + dev->node_props.domain); + sysfs_show_32bit_prop(buffer, offs, "drm_render_minor", + dev->node_props.drm_render_minor); + sysfs_show_64bit_prop(buffer, offs, "hive_id", + dev->node_props.hive_id); + sysfs_show_32bit_prop(buffer, offs, "num_sdma_engines", + dev->node_props.num_sdma_engines); + sysfs_show_32bit_prop(buffer, offs, "num_sdma_xgmi_engines", + dev->node_props.num_sdma_xgmi_engines); + sysfs_show_32bit_prop(buffer, offs, "num_sdma_queues_per_engine", + dev->node_props.num_sdma_queues_per_engine); + sysfs_show_32bit_prop(buffer, offs, "num_cp_queues", + dev->node_props.num_cp_queues); + sysfs_show_64bit_prop(buffer, offs, "unique_id", + dev->node_props.unique_id); if (dev->gpu) { log_max_watch_addr = @@ -513,22 +518,21 @@ static ssize_t node_show(struct kobject *kobj, struct attribute *attr, dev->node_props.capability |= HSA_CAP_AQL_QUEUE_DOUBLE_MAP; - sysfs_show_32bit_prop(buffer, "max_engine_clk_fcompute", + sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_fcompute", dev->node_props.max_engine_clk_fcompute); - sysfs_show_64bit_prop(buffer, "local_mem_size", - (unsigned long long int) 0); + sysfs_show_64bit_prop(buffer, offs, "local_mem_size", 0ULL); - sysfs_show_32bit_prop(buffer, "fw_version", - dev->gpu->mec_fw_version); - sysfs_show_32bit_prop(buffer, "capability", - dev->node_props.capability); - sysfs_show_32bit_prop(buffer, "sdma_fw_version", - dev->gpu->sdma_fw_version); + sysfs_show_32bit_prop(buffer, offs, "fw_version", + dev->gpu->mec_fw_version); + sysfs_show_32bit_prop(buffer, offs, "capability", + dev->node_props.capability); + sysfs_show_32bit_prop(buffer, offs, "sdma_fw_version", + dev->gpu->sdma_fw_version); } - return sysfs_show_32bit_prop(buffer, "max_engine_clk_ccompute", - cpufreq_quick_get_max(0)/1000); + return sysfs_show_32bit_prop(buffer, offs, "max_engine_clk_ccompute", + cpufreq_quick_get_max(0)/1000); } static const struct sysfs_ops node_ops = { @@ -632,8 +636,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, ret = kobject_init_and_add(dev->kobj_node, &node_type, sys_props.kobj_nodes, "%d", id); - if (ret < 0) + if (ret < 0) { + kobject_put(dev->kobj_node); return ret; + } dev->kobj_mem = kobject_create_and_add("mem_banks", dev->kobj_node); if (!dev->kobj_mem) @@ -680,8 +686,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return -ENOMEM; ret = kobject_init_and_add(mem->kobj, &mem_type, dev->kobj_mem, "%d", i); - if (ret < 0) + if (ret < 0) { + kobject_put(mem->kobj); return ret; + } mem->attr.name = "properties"; mem->attr.mode = KFD_SYSFS_FILE_MODE; @@ -699,8 +707,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return -ENOMEM; ret = kobject_init_and_add(cache->kobj, &cache_type, dev->kobj_cache, "%d", i); - if (ret < 0) + if (ret < 0) { + kobject_put(cache->kobj); return ret; + } cache->attr.name = "properties"; cache->attr.mode = KFD_SYSFS_FILE_MODE; @@ -718,8 +728,10 @@ static int kfd_build_sysfs_node_entry(struct kfd_topology_device *dev, return -ENOMEM; ret = kobject_init_and_add(iolink->kobj, &iolink_type, dev->kobj_iolink, "%d", i); - if (ret < 0) + if (ret < 0) { + kobject_put(iolink->kobj); return ret; + } iolink->attr.name = "properties"; iolink->attr.mode = KFD_SYSFS_FILE_MODE; @@ -798,8 +810,10 @@ static int kfd_topology_update_sysfs(void) ret = kobject_init_and_add(sys_props.kobj_topology, &sysprops_type, &kfd_device->kobj, "topology"); - if (ret < 0) + if (ret < 0) { + kobject_put(sys_props.kobj_topology); return ret; + } sys_props.kobj_nodes = kobject_create_and_add("nodes", sys_props.kobj_topology); @@ -1359,6 +1373,8 @@ int kfd_topology_add_device(struct kfd_dev *gpu) case CHIP_NAVI10: case CHIP_NAVI12: case CHIP_NAVI14: + case CHIP_SIENNA_CICHLID: + case CHIP_NAVY_FLOUNDER: dev->node_props.capability |= ((HSA_CAP_DOORBELL_TYPE_2_0 << HSA_CAP_DOORBELL_TYPE_TOTALBITS_SHIFT) & HSA_CAP_DOORBELL_TYPE_TOTALBITS_MASK); |