aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/misc/habanalabs/gaudi/gaudi.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/misc/habanalabs/gaudi/gaudi.c')
-rw-r--r--drivers/misc/habanalabs/gaudi/gaudi.c313
1 files changed, 221 insertions, 92 deletions
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 825737dfe381..013c6da2e3ca 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * Copyright 2016-2020 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
* All Rights Reserved.
*/
@@ -593,26 +593,27 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
else
prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
prop->mmu_pte_size = HL_PTE_SIZE;
- prop->mmu_hop_table_size = HOP_TABLE_SIZE;
- prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
+ prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
+ prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
prop->dram_page_size = PAGE_SIZE_2MB;
prop->dram_supports_virtual_memory = false;
- prop->pmmu.hop0_shift = HOP0_SHIFT;
- prop->pmmu.hop1_shift = HOP1_SHIFT;
- prop->pmmu.hop2_shift = HOP2_SHIFT;
- prop->pmmu.hop3_shift = HOP3_SHIFT;
- prop->pmmu.hop4_shift = HOP4_SHIFT;
- prop->pmmu.hop0_mask = HOP0_MASK;
- prop->pmmu.hop1_mask = HOP1_MASK;
- prop->pmmu.hop2_mask = HOP2_MASK;
- prop->pmmu.hop3_mask = HOP3_MASK;
- prop->pmmu.hop4_mask = HOP4_MASK;
+ prop->pmmu.hop0_shift = MMU_V1_1_HOP0_SHIFT;
+ prop->pmmu.hop1_shift = MMU_V1_1_HOP1_SHIFT;
+ prop->pmmu.hop2_shift = MMU_V1_1_HOP2_SHIFT;
+ prop->pmmu.hop3_shift = MMU_V1_1_HOP3_SHIFT;
+ prop->pmmu.hop4_shift = MMU_V1_1_HOP4_SHIFT;
+ prop->pmmu.hop0_mask = MMU_V1_1_HOP0_MASK;
+ prop->pmmu.hop1_mask = MMU_V1_1_HOP1_MASK;
+ prop->pmmu.hop2_mask = MMU_V1_1_HOP2_MASK;
+ prop->pmmu.hop3_mask = MMU_V1_1_HOP3_MASK;
+ prop->pmmu.hop4_mask = MMU_V1_1_HOP4_MASK;
prop->pmmu.start_addr = VA_HOST_SPACE_START;
prop->pmmu.end_addr =
(VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1;
prop->pmmu.page_size = PAGE_SIZE_4KB;
prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
+ prop->pmmu.last_mask = LAST_MASK;
/* PMMU and HPMMU are the same except of page size */
memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -664,6 +665,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
prop->clk_pll_index = HL_GAUDI_MME_PLL;
prop->max_freq_value = GAUDI_MAX_CLK_FREQ;
+ prop->use_get_power_for_reset_history = true;
+
return 0;
}
@@ -878,6 +881,11 @@ static int gaudi_fetch_psoc_frequency(struct hl_device *hdev)
int rc;
if (hdev->asic_prop.fw_security_enabled) {
+ struct gaudi_device *gaudi = hdev->asic_specific;
+
+ if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
+ return 0;
+
rc = hl_fw_cpucp_pll_info_get(hdev, HL_GAUDI_CPU_PLL, pll_freq_arr);
if (rc)
@@ -1273,6 +1281,7 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
struct hl_cs_compl *cs_cmpl =
container_of(cs->fence, struct hl_cs_compl, base_fence);
+ struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
struct gaudi_collective_properties *cprop;
u32 stream, queue_id, sob_group_offset;
struct gaudi_device *gaudi;
@@ -1285,10 +1294,16 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
gaudi = hdev->asic_specific;
cprop = &gaudi->collective_props;
- /* In encaps signals case the SOB info will be retrieved from
- * the handle in gaudi_collective_slave_init_job.
- */
- if (!cs->encaps_signals) {
+ if (cs->encaps_signals) {
+ cs_cmpl->hw_sob = handle->hw_sob;
+ /* at this checkpoint we only need the hw_sob pointer
+ * for the completion check before start going over the jobs
+ * of the master/slaves, the sob_value will be taken later on
+ * in gaudi_collective_slave_init_job depends on each
+ * job wait offset value.
+ */
+ cs_cmpl->sob_val = 0;
+ } else {
/* copy the SOB id and value of the signal CS */
cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
@@ -1621,6 +1636,8 @@ static int gaudi_late_init(struct hl_device *hdev)
*/
gaudi_mmu_prepare(hdev, 1);
+ hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
+
return 0;
disable_pci_access:
@@ -4006,7 +4023,7 @@ static void gaudi_init_firmware_loader(struct hl_device *hdev)
struct fw_load_mgr *fw_loader = &hdev->fw_loader;
/* fill common fields */
- fw_loader->linux_loaded = false;
+ fw_loader->fw_comp_loaded = FW_TYPE_NONE;
fw_loader->boot_fit_img.image_name = GAUDI_BOOT_FIT_FILE;
fw_loader->linux_img.image_name = GAUDI_LINUX_FW_FILE;
fw_loader->cpu_timeout = GAUDI_CPU_TIMEOUT_USEC;
@@ -4289,13 +4306,31 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
* via the GIC. Otherwise, we need to use COMMS or the MSG_TO_CPU
* registers in case of old F/Ws
*/
- if (hdev->fw_loader.linux_loaded) {
+ if (hdev->fw_loader.fw_comp_loaded & FW_TYPE_LINUX) {
irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
le32_to_cpu(dyn_regs->gic_host_halt_irq);
WREG32(irq_handler_offset,
gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id);
+
+ /* This is a hail-mary attempt to revive the card in the small chance that the
+ * f/w has experienced a watchdog event, which caused it to return back to preboot.
+ * In that case, triggering reset through GIC won't help. We need to trigger the
+ * reset as if Linux wasn't loaded.
+ *
+ * We do it only if the reset cause was HB, because that would be the indication
+ * of such an event.
+ *
+ * In case watchdog hasn't expired but we still got HB, then this won't do any
+ * damage.
+ */
+ if (hdev->reset_info.curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) {
+ if (hdev->asic_prop.hard_reset_done_by_fw)
+ hl_fw_ask_hard_reset_without_linux(hdev);
+ else
+ hl_fw_ask_halt_machine_without_linux(hdev);
+ }
} else {
if (hdev->asic_prop.hard_reset_done_by_fw)
hl_fw_ask_hard_reset_without_linux(hdev);
@@ -6412,6 +6447,7 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size,
{
u32 dma_core_sts0, err_cause, cfg1, size_left, pos, size_to_dma;
struct gaudi_device *gaudi = hdev->asic_specific;
+ u32 qm_glbl_sts0, qm_cgm_sts;
u64 dma_offset, qm_offset;
dma_addr_t dma_addr;
void *kernel_addr;
@@ -6436,14 +6472,20 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size,
dma_offset = dma_id * DMA_CORE_OFFSET;
qm_offset = dma_id * DMA_QMAN_OFFSET;
dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset);
- is_eng_idle = IS_DMA_IDLE(dma_core_sts0);
+ qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset);
+ qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset);
+ is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) &&
+ IS_DMA_IDLE(dma_core_sts0);
if (!is_eng_idle) {
dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2];
dma_offset = dma_id * DMA_CORE_OFFSET;
qm_offset = dma_id * DMA_QMAN_OFFSET;
dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset);
- is_eng_idle = IS_DMA_IDLE(dma_core_sts0);
+ qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset);
+ qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset);
+ is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) &&
+ IS_DMA_IDLE(dma_core_sts0);
if (!is_eng_idle) {
dev_err_ratelimited(hdev->dev,
@@ -6522,7 +6564,7 @@ static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- if (hdev->hard_reset_pending)
+ if (hdev->reset_info.hard_reset_pending)
return U64_MAX;
return readq(hdev->pcie_bar[HBM_BAR_ID] +
@@ -6533,7 +6575,7 @@ static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- if (hdev->hard_reset_pending)
+ if (hdev->reset_info.hard_reset_pending)
return;
writeq(val, hdev->pcie_bar[HBM_BAR_ID] +
@@ -6935,8 +6977,9 @@ event_not_supported:
snprintf(desc, size, "N/A");
}
-static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev,
- u32 x_y, bool is_write)
+static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y,
+ bool is_write, s32 *engine_id_1,
+ s32 *engine_id_2)
{
u32 dma_id[2], dma_offset, err_cause[2], mask, i;
@@ -6976,44 +7019,64 @@ static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev,
switch (x_y) {
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_0;
return "DMA0";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_2;
return "DMA2";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_0;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_2;
return "DMA0 or DMA2";
+ }
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_1;
return "DMA1";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_3;
return "DMA3";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_1;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_3;
return "DMA1 or DMA3";
+ }
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_4;
return "DMA4";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_6;
return "DMA6";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_4;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_6;
return "DMA4 or DMA6";
+ }
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1:
- if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+ if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_5;
return "DMA5";
- else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+ } else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_7;
return "DMA7";
- else
+ } else {
+ *engine_id_1 = GAUDI_ENGINE_ID_DMA_5;
+ *engine_id_2 = GAUDI_ENGINE_ID_DMA_7;
return "DMA5 or DMA7";
+ }
}
unknown_initiator:
return "unknown initiator";
}
-static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
- bool is_write)
+static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write,
+ u32 *engine_id_1, u32 *engine_id_2)
{
u32 val, x_y, axi_id;
@@ -7026,24 +7089,35 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
switch (x_y) {
case RAZWI_INITIATOR_ID_X_Y_TPC0_NIC0:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_0;
return "TPC0";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_0;
return "NIC0";
+ }
break;
case RAZWI_INITIATOR_ID_X_Y_TPC1:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_1;
return "TPC1";
case RAZWI_INITIATOR_ID_X_Y_MME0_0:
case RAZWI_INITIATOR_ID_X_Y_MME0_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_0;
return "MME0";
case RAZWI_INITIATOR_ID_X_Y_MME1_0:
case RAZWI_INITIATOR_ID_X_Y_MME1_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_1;
return "MME1";
case RAZWI_INITIATOR_ID_X_Y_TPC2:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_2;
return "TPC2";
case RAZWI_INITIATOR_ID_X_Y_TPC3_PCI_CPU_PSOC:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_3;
return "TPC3";
+ }
+ /* PCI, CPU or PSOC does not have engine id*/
if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PCI))
return "PCI";
if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_CPU))
@@ -7059,32 +7133,49 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0:
case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1:
- return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write);
+ return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write,
+ engine_id_1, engine_id_2);
case RAZWI_INITIATOR_ID_X_Y_TPC4_NIC1_NIC2:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_4;
return "TPC4";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_1;
return "NIC1";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_2;
return "NIC2";
+ }
break;
case RAZWI_INITIATOR_ID_X_Y_TPC5:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_5;
return "TPC5";
case RAZWI_INITIATOR_ID_X_Y_MME2_0:
case RAZWI_INITIATOR_ID_X_Y_MME2_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_2;
return "MME2";
case RAZWI_INITIATOR_ID_X_Y_MME3_0:
case RAZWI_INITIATOR_ID_X_Y_MME3_1:
+ *engine_id_1 = GAUDI_ENGINE_ID_MME_3;
return "MME3";
case RAZWI_INITIATOR_ID_X_Y_TPC6:
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_6;
return "TPC6";
case RAZWI_INITIATOR_ID_X_Y_TPC7_NIC4_NIC5:
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_TPC_7;
return "TPC7";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_4;
return "NIC4";
- if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT))
+ }
+ if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) {
+ *engine_id_1 = GAUDI_ENGINE_ID_NIC_5;
return "NIC5";
+ }
break;
default:
break;
@@ -7101,27 +7192,28 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
return "unknown initiator";
}
-static void gaudi_print_razwi_info(struct hl_device *hdev)
+static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1,
+ u32 *engine_id_2)
{
+
if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) {
dev_err_ratelimited(hdev->dev,
"RAZWI event caused by illegal write of %s\n",
- gaudi_get_razwi_initiator_name(hdev, true));
+ gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0);
}
if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) {
dev_err_ratelimited(hdev->dev,
"RAZWI event caused by illegal read of %s\n",
- gaudi_get_razwi_initiator_name(hdev, false));
+ gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2));
WREG32(mmMMU_UP_RAZWI_READ_VLD, 0);
}
}
-static void gaudi_print_mmu_error_info(struct hl_device *hdev)
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type)
{
struct gaudi_device *gaudi = hdev->asic_specific;
- u64 addr;
u32 val;
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU))
@@ -7129,24 +7221,24 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev)
val = RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE);
if (val & MMU_UP_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) {
- addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
- addr <<= 32;
- addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
+ *addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
+ *addr <<= 32;
+ *addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
- dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n",
- addr);
+ dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
+ *type = HL_RAZWI_PAGE_FAULT;
WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
}
val = RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE);
if (val & MMU_UP_ACCESS_ERROR_CAPTURE_ENTRY_VALID_MASK) {
- addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK;
- addr <<= 32;
- addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
+ *addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK;
+ *addr <<= 32;
+ *addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
- dev_err_ratelimited(hdev->dev,
- "MMU access error on va 0x%llx\n", addr);
+ dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr);
+ *type = HL_RAZWI_MMU_ACCESS_ERROR;
WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0);
}
@@ -7665,15 +7757,46 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type)
static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
bool razwi)
{
+ u32 engine_id_1, engine_id_2;
char desc[64] = "";
+ u64 razwi_addr = 0;
+ u8 razwi_type;
+ int rc;
+
+ /*
+ * Init engine id by default as not valid and only if razwi initiated from engine with
+ * engine id it will get valid value.
+ * Init razwi type to default, will be changed only if razwi caused by page fault of
+ * MMU access error
+ */
+ engine_id_1 = U16_MAX;
+ engine_id_2 = U16_MAX;
+ razwi_type = U8_MAX;
gaudi_get_event_desc(event_type, desc, sizeof(desc));
dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
event_type, desc);
if (razwi) {
- gaudi_print_razwi_info(hdev);
- gaudi_print_mmu_error_info(hdev);
+ gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2);
+ gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type);
+
+ /* In case it's the first razwi, save its parameters*/
+ rc = atomic_cmpxchg(&hdev->last_error.razwi_write_disable, 0, 1);
+ if (!rc) {
+ hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime;
+ hdev->last_error.razwi_timestamp = ktime_get();
+ hdev->last_error.razwi_addr = razwi_addr;
+ hdev->last_error.razwi_engine_id_1 = engine_id_1;
+ hdev->last_error.razwi_engine_id_2 = engine_id_2;
+ /*
+ * If first engine id holds non valid value the razwi initiator
+ * does not have engine id
+ */
+ hdev->last_error.razwi_non_engine_initiator = (engine_id_1 == U16_MAX);
+ hdev->last_error.razwi_type = razwi_type;
+
+ }
}
}
@@ -7696,14 +7819,10 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev,
fw_alive->thread_id, fw_alive->uptime_seconds);
}
-static int gaudi_soft_reset_late_init(struct hl_device *hdev)
+static int gaudi_non_hard_reset_late_init(struct hl_device *hdev)
{
- struct gaudi_device *gaudi = hdev->asic_specific;
-
- /* Unmask all IRQs since some could have been received
- * during the soft reset
- */
- return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events));
+ /* GAUDI doesn't support any reset except hard-reset */
+ return -EPERM;
}
static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
@@ -7897,27 +8016,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
static void gaudi_print_clk_change_info(struct hl_device *hdev,
u16 event_type)
{
+ ktime_t zero_time = ktime_set(0, 0);
+
+ mutex_lock(&hdev->clk_throttling.lock);
+
switch (event_type) {
case GAUDI_EVENT_FIX_POWER_ENV_S:
- hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+ hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+ hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+ hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+ hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to power consumption\n");
break;
case GAUDI_EVENT_FIX_POWER_ENV_E:
- hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+ hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+ hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
dev_info_ratelimited(hdev->dev,
"Power envelop is safe, back to optimal clock\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_S:
- hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+ hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+ hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+ hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+ hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
dev_info_ratelimited(hdev->dev,
"Clock throttling due to overheating\n");
break;
case GAUDI_EVENT_FIX_THERMAL_ENV_E:
- hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+ hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+ hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
dev_info_ratelimited(hdev->dev,
"Thermal envelop is safe, back to optimal clock\n");
break;
@@ -7927,6 +8058,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
event_type);
break;
}
+
+ mutex_unlock(&hdev->clk_throttling.lock);
}
static void gaudi_handle_eqe(struct hl_device *hdev,
@@ -7975,7 +8108,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
gaudi_print_irq_info(hdev, event_type, true);
gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
- fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+ fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_GIC500:
@@ -7983,7 +8116,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
case GAUDI_EVENT_L2_RAM_ECC:
case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
gaudi_print_irq_info(hdev, event_type, false);
- fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+ fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_HBM0_SPI_0:
@@ -7994,7 +8127,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
gaudi_hbm_read_interrupts(hdev,
gaudi_hbm_event_to_dev(event_type),
&eq_entry->hbm_ecc_data);
- fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+ fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
goto reset_device;
case GAUDI_EVENT_HBM0_SPI_1:
@@ -8177,9 +8310,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
reset_device:
if (hdev->asic_prop.fw_security_enabled)
- hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag);
+ hl_device_reset(hdev, HL_DRV_RESET_HARD
+ | HL_DRV_RESET_BYPASS_REQ_TO_FW
+ | fw_fatal_err_flag);
else if (hdev->hard_reset_on_fw_events)
- hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag);
+ hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
else
hl_fw_unmask_irq(hdev, event_type);
}
@@ -8206,7 +8341,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
int rc;
if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
- hdev->hard_reset_pending)
+ hdev->reset_info.hard_reset_pending)
return 0;
if (hdev->pldm)
@@ -8229,12 +8364,6 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
WREG32(mmSTLB_INV_SET, 0);
- if (rc) {
- dev_err_ratelimited(hdev->dev,
- "MMU cache invalidation timeout\n");
- hl_device_reset(hdev, HL_RESET_HARD);
- }
-
return rc;
}
@@ -8662,7 +8791,7 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev,
hdev->internal_cb_pool_dma_addr,
HOST_SPACE_INTERNAL_CB_SZ);
- hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
+ hdev->asic_funcs->mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
mutex_unlock(&ctx->mmu_lock);
if (rc)
@@ -8697,7 +8826,7 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev,
HOST_SPACE_INTERNAL_CB_SZ);
hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base,
HOST_SPACE_INTERNAL_CB_SZ);
- hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+ hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
mutex_unlock(&ctx->mmu_lock);
gen_pool_destroy(hdev->internal_cb_pool);
@@ -9458,7 +9587,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
.disable_clock_gating = gaudi_disable_clock_gating,
.debug_coresight = gaudi_debug_coresight,
.is_device_idle = gaudi_is_device_idle,
- .soft_reset_late_init = gaudi_soft_reset_late_init,
+ .non_hard_reset_late_init = gaudi_non_hard_reset_late_init,
.hw_queues_lock = gaudi_hw_queues_lock,
.hw_queues_unlock = gaudi_hw_queues_unlock,
.get_pci_id = gaudi_get_pci_id,