diff options
Diffstat (limited to 'drivers/misc/habanalabs/common/device.c')
-rw-r--r-- | drivers/misc/habanalabs/common/device.c | 65 |
1 files changed, 49 insertions, 16 deletions
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c index 5871162a8442..15fcb5c31c4b 100644 --- a/drivers/misc/habanalabs/common/device.c +++ b/drivers/misc/habanalabs/common/device.c @@ -17,12 +17,12 @@ enum hl_device_status hl_device_status(struct hl_device *hdev) { enum hl_device_status status; - if (hdev->disabled) - status = HL_DEVICE_STATUS_MALFUNCTION; - else if (atomic_read(&hdev->in_reset)) + if (atomic_read(&hdev->in_reset)) status = HL_DEVICE_STATUS_IN_RESET; else if (hdev->needs_reset) status = HL_DEVICE_STATUS_NEEDS_RESET; + else if (hdev->disabled) + status = HL_DEVICE_STATUS_MALFUNCTION; else status = HL_DEVICE_STATUS_OPERATIONAL; @@ -142,6 +142,9 @@ static int hl_mmap(struct file *filp, struct vm_area_struct *vma) switch (vm_pgoff & HL_MMAP_TYPE_MASK) { case HL_MMAP_TYPE_CB: return hl_cb_mmap(hpriv, vma); + + case HL_MMAP_TYPE_BLOCK: + return hl_hw_block_mmap(hpriv, vma); } return -EINVAL; @@ -373,7 +376,6 @@ static int device_early_init(struct hl_device *hdev) mutex_init(&hdev->send_cpu_message_lock); mutex_init(&hdev->debug_lock); - mutex_init(&hdev->mmu_cache_lock); INIT_LIST_HEAD(&hdev->cs_mirror_list); spin_lock_init(&hdev->cs_mirror_lock); INIT_LIST_HEAD(&hdev->fpriv_list); @@ -414,7 +416,6 @@ static void device_early_fini(struct hl_device *hdev) { int i; - mutex_destroy(&hdev->mmu_cache_lock); mutex_destroy(&hdev->debug_lock); mutex_destroy(&hdev->send_cpu_message_lock); @@ -1037,7 +1038,7 @@ kill_processes: if (hard_reset) { /* Release kernel context */ - if (hl_ctx_put(hdev->kernel_ctx) == 1) + if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1) hdev->kernel_ctx = NULL; hl_vm_fini(hdev); hl_mmu_fini(hdev); @@ -1092,6 +1093,7 @@ kill_processes: GFP_KERNEL); if (!hdev->kernel_ctx) { rc = -ENOMEM; + hl_mmu_fini(hdev); goto out_err; } @@ -1103,6 +1105,7 @@ kill_processes: "failed to init kernel ctx in hard reset\n"); kfree(hdev->kernel_ctx); hdev->kernel_ctx = NULL; + hl_mmu_fini(hdev); goto out_err; } } @@ -1156,12 +1159,20 @@ kill_processes: atomic_set(&hdev->in_reset, 0); hdev->needs_reset = false; - if (hard_reset) + dev_notice(hdev->dev, "Successfully finished resetting the device\n"); + + if (hard_reset) { hdev->hard_reset_cnt++; - else - hdev->soft_reset_cnt++; - dev_warn(hdev->dev, "Successfully finished resetting the device\n"); + /* After reset is done, we are ready to receive events from + * the F/W. We can't do it before because we will ignore events + * and if those events are fatal, we won't know about it and + * the device will be operational although it shouldn't be + */ + hdev->asic_funcs->enable_events_from_fw(hdev); + } else { + hdev->soft_reset_cnt++; + } return 0; @@ -1312,11 +1323,16 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->compute_ctx = NULL; + hl_debugfs_add_device(hdev); + + /* debugfs nodes are created in hl_ctx_init so it must be called after + * hl_debugfs_add_device. + */ rc = hl_ctx_init(hdev, hdev->kernel_ctx, true); if (rc) { dev_err(hdev->dev, "failed to initialize kernel context\n"); kfree(hdev->kernel_ctx); - goto mmu_fini; + goto remove_device_from_debugfs; } rc = hl_cb_pool_init(hdev); @@ -1325,8 +1341,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) goto release_ctx; } - hl_debugfs_add_device(hdev); - /* * From this point, in case of an error, add char devices and create * sysfs nodes as part of the error flow, to allow debugging. @@ -1409,12 +1423,21 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass) hdev->init_done = true; + /* After initialization is done, we are ready to receive events from + * the F/W. We can't do it before because we will ignore events and if + * those events are fatal, we won't know about it and the device will + * be operational although it shouldn't be + */ + hdev->asic_funcs->enable_events_from_fw(hdev); + return 0; release_ctx: if (hl_ctx_put(hdev->kernel_ctx) != 1) dev_err(hdev->dev, "kernel ctx is still alive on initialization failure\n"); +remove_device_from_debugfs: + hl_debugfs_remove_device(hdev); mmu_fini: hl_mmu_fini(hdev); eq_fini: @@ -1480,11 +1503,21 @@ void hl_device_fini(struct hl_device *hdev) usleep_range(50, 200); rc = atomic_cmpxchg(&hdev->in_reset, 0, 1); if (ktime_compare(ktime_get(), timeout) > 0) { - WARN(1, "Failed to remove device because reset function did not finish\n"); + dev_crit(hdev->dev, + "Failed to remove device because reset function did not finish\n"); return; } } + /* Disable PCI access from device F/W so it won't send us additional + * interrupts. We disable MSI/MSI-X at the halt_engines function and we + * can't have the F/W sending us interrupts after that. We need to + * disable the access here because if the device is marked disable, the + * message won't be send. Also, in case of heartbeat, the device CPU is + * marked as disable so this message won't be sent + */ + hl_fw_send_pci_access_msg(hdev, CPUCP_PACKET_DISABLE_PCI_ACCESS); + /* Mark device as disabled */ hdev->disabled = true; @@ -1504,8 +1537,6 @@ void hl_device_fini(struct hl_device *hdev) device_late_fini(hdev); - hl_debugfs_remove_device(hdev); - /* * Halt the engines and disable interrupts so we won't get any more * completions from H/W and we won't have any accesses from the @@ -1537,6 +1568,8 @@ void hl_device_fini(struct hl_device *hdev) if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1)) dev_err(hdev->dev, "kernel ctx is still alive\n"); + hl_debugfs_remove_device(hdev); + hl_vm_fini(hdev); hl_mmu_fini(hdev); |