16 files changed, 2228 insertions, 1174 deletions
diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index ada570f35a41..6f6a904ab6ca 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -11,7 +11,6 @@
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
-#include <linux/genalloc.h>
 
 static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 {
@@ -68,9 +67,9 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 	bus_addr = cb->bus_address;
 	offset = 0;
 	list_for_each_entry(va_block, &cb->va_block_list, node) {
-		rc = hl_mmu_map(ctx, va_block->start, bus_addr, va_block->size,
-				list_is_last(&va_block->node,
-						&cb->va_block_list));
+		rc = hl_mmu_map_page(ctx, va_block->start, bus_addr,
+				va_block->size, list_is_last(&va_block->node,
+							&cb->va_block_list));
 		if (rc) {
 			dev_err(hdev->dev, "Failed to map VA %#llx to CB\n",
 				va_block->start);
@@ -93,7 +92,7 @@ err_va_umap:
 	list_for_each_entry(va_block, &cb->va_block_list, node) {
 		if (offset <= 0)
 			break;
-		hl_mmu_unmap(ctx, va_block->start, va_block->size,
+		hl_mmu_unmap_page(ctx, va_block->start, va_block->size,
 				offset <= va_block->size);
 		offset -= va_block->size;
 	}
@@ -120,7 +119,7 @@ static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 	mutex_lock(&ctx->mmu_lock);
 
 	list_for_each_entry(va_block, &cb->va_block_list, node)
-		if (hl_mmu_unmap(ctx, va_block->start, va_block->size,
+		if (hl_mmu_unmap_page(ctx, va_block->start, va_block->size,
 				list_is_last(&va_block->node,
 						&cb->va_block_list)))
 			dev_warn_ratelimited(hdev->dev,
@@ -376,17 +375,49 @@ int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle)
 	return rc;
 }
 
+static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr,
+			u64 cb_handle, u32 *usage_cnt)
+{
+	struct hl_cb *cb;
+	u32 handle;
+	int rc = 0;
+
+	/* The CB handle was given to user to do mmap, so need to shift it back
+	 * to the value which was allocated by the IDR module.
+	 */
+	cb_handle >>= PAGE_SHIFT;
+	handle = (u32) cb_handle;
+
+	spin_lock(&mgr->cb_lock);
+
+	cb = idr_find(&mgr->cb_handles, handle);
+	if (!cb) {
+		dev_err(hdev->dev,
+			"CB info failed, no match to handle 0x%x\n", handle);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	*usage_cnt = atomic_read(&cb->cs_cnt);
+
+out:
+	spin_unlock(&mgr->cb_lock);
+	return rc;
+}
+
 int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	union hl_cb_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
+	enum hl_device_status status;
 	u64 handle = 0;
+	u32 usage_cnt = 0;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't execute CB IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -413,6 +444,13 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 					args->in.cb_handle);
 		break;
 
+	case HL_CB_OP_INFO:
+		rc = hl_cb_info(hdev, &hpriv->cb_mgr, args->in.cb_handle,
+				&usage_cnt);
+		memset(args, 0, sizeof(*args));
+		args->out.usage_cnt = usage_cnt;
+		break;
+
 	default:
 		rc = -ENOTTY;
 		break;
@@ -517,6 +555,7 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 	}
 
 	cb->mmap_size = cb->size;
+	vma->vm_pgoff = handle;
 
 	return 0;
 
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index b2b974ecc431..beb482310a58 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -11,11 +11,25 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 
-#define HL_CS_FLAGS_SIG_WAIT	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT)
+#define HL_CS_FLAGS_TYPE_MASK	(HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT | \
+				HL_CS_FLAGS_COLLECTIVE_WAIT)
+
+/**
+ * enum hl_cs_wait_status - cs wait status
+ * @CS_WAIT_STATUS_BUSY: cs was not completed yet
+ * @CS_WAIT_STATUS_COMPLETED: cs completed
+ * @CS_WAIT_STATUS_GONE: cs completed but fence is already gone
+ */
+enum hl_cs_wait_status {
+	CS_WAIT_STATUS_BUSY,
+	CS_WAIT_STATUS_COMPLETED,
+	CS_WAIT_STATUS_GONE
+};
 
 static void job_wq_completion(struct work_struct *work);
-static long _hl_cs_wait_ioctl(struct hl_device *hdev,
-		struct hl_ctx *ctx, u64 timeout_us, u64 seq);
+static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
+				u64 timeout_us, u64 seq,
+				enum hl_cs_wait_status *status, s64 *timestamp);
 static void cs_do_release(struct kref *ref);
 
 static void hl_sob_reset(struct kref *ref)
@@ -38,6 +52,38 @@ void hl_sob_reset_error(struct kref *ref)
 			hw_sob->q_idx, hw_sob->sob_id);
 }
 
+/**
+ * hl_gen_sob_mask() - Generates a sob mask to be used in a monitor arm packet
+ * @sob_base: sob base id
+ * @sob_mask: sob user mask, each bit represents a sob offset from sob base
+ * @mask: generated mask
+ *
+ * Return: 0 if given parameters are valid
+ */
+int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask)
+{
+	int i;
+
+	if (sob_mask == 0)
+		return -EINVAL;
+
+	if (sob_mask == 0x1) {
+		*mask = ~(1 << (sob_base & 0x7));
+	} else {
+		/* find msb in order to verify sob range is valid */
+		for (i = BITS_PER_BYTE - 1 ; i >= 0 ; i--)
+			if (BIT(i) & sob_mask)
+				break;
+
+		if (i > (HL_MAX_SOBS_PER_MONITOR - (sob_base & 0x7) - 1))
+			return -EINVAL;
+
+		*mask = ~sob_mask;
+	}
+
+	return 0;
+}
+
 static void hl_fence_release(struct kref *kref)
 {
 	struct hl_fence *fence =
@@ -53,7 +99,8 @@ static void hl_fence_release(struct kref *kref)
 		goto free;
 
 	if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
-			(hl_cs_cmpl->type == CS_TYPE_WAIT)) {
+		(hl_cs_cmpl->type == CS_TYPE_WAIT) ||
+		(hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)) {
 
 		dev_dbg(hdev->dev,
 			"CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
@@ -80,6 +127,10 @@ static void hl_fence_release(struct kref *kref)
 		 * hence the above scenario is avoided.
 		 */
 		kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
+
+		if (hl_cs_cmpl->type == CS_TYPE_COLLECTIVE_WAIT)
+			hdev->asic_funcs->reset_sob_group(hdev,
+					hl_cs_cmpl->sob_group);
 	}
 
 free:
@@ -102,10 +153,11 @@ static void hl_fence_init(struct hl_fence *fence)
 {
 	kref_init(&fence->refcount);
 	fence->error = 0;
+	fence->timestamp = ktime_set(0, 0);
 	init_completion(&fence->completion);
 }
 
-static void cs_get(struct hl_cs *cs)
+void cs_get(struct hl_cs *cs)
 {
 	kref_get(&cs->refcount);
 }
@@ -120,6 +172,18 @@ static void cs_put(struct hl_cs *cs)
 	kref_put(&cs->refcount, cs_do_release);
 }
 
+static void cs_job_do_release(struct kref *ref)
+{
+	struct hl_cs_job *job = container_of(ref, struct hl_cs_job, refcount);
+
+	kfree(job);
+}
+
+static void cs_job_put(struct hl_cs_job *job)
+{
+	kref_put(&job->refcount, cs_job_do_release);
+}
+
 static bool is_cb_patched(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	/*
@@ -169,10 +233,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 			job->patched_cb = parser.patched_cb;
 			job->job_cb_size = parser.patched_cb_size;
 			job->contains_dma_pkt = parser.contains_dma_pkt;
-
-			spin_lock(&job->patched_cb->lock);
-			job->patched_cb->cs_cnt++;
-			spin_unlock(&job->patched_cb->lock);
+			atomic_inc(&job->patched_cb->cs_cnt);
 		}
 
 		/*
@@ -180,9 +241,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 		 * original CB anymore because it was already parsed and
 		 * won't be accessed again for this CS
 		 */
-		spin_lock(&job->user_cb->lock);
-		job->user_cb->cs_cnt--;
-		spin_unlock(&job->user_cb->lock);
+		atomic_dec(&job->user_cb->cs_cnt);
 		hl_cb_put(job->user_cb);
 		job->user_cb = NULL;
 	} else if (!rc) {
@@ -192,7 +251,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
 	return rc;
 }
 
-static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
+static void complete_job(struct hl_device *hdev, struct hl_cs_job *job)
 {
 	struct hl_cs *cs = job->cs;
 
@@ -204,10 +263,7 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 		 * created, so we need to check it's not NULL
 		 */
 		if (job->patched_cb) {
-			spin_lock(&job->patched_cb->lock);
-			job->patched_cb->cs_cnt--;
-			spin_unlock(&job->patched_cb->lock);
-
+			atomic_dec(&job->patched_cb->cs_cnt);
 			hl_cb_put(job->patched_cb);
 		}
 	}
@@ -215,13 +271,12 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 	/* For H/W queue jobs, if a user CB was allocated by driver and MMU is
 	 * enabled, the user CB isn't released in cs_parser() and thus should be
 	 * released here.
+	 * This is also true for INT queues jobs which were allocated by driver
 	 */
-	if (job->queue_type == QUEUE_TYPE_HW &&
-			job->is_kernel_allocated_cb && hdev->mmu_enable) {
-		spin_lock(&job->user_cb->lock);
-		job->user_cb->cs_cnt--;
-		spin_unlock(&job->user_cb->lock);
-
+	if (job->is_kernel_allocated_cb &&
+		((job->queue_type == QUEUE_TYPE_HW && hdev->mmu_enable) ||
+				job->queue_type == QUEUE_TYPE_INT)) {
+		atomic_dec(&job->user_cb->cs_cnt);
 		hl_cb_put(job->user_cb);
 	}
 
@@ -239,27 +294,12 @@ static void free_job(struct hl_device *hdev, struct hl_cs_job *job)
 			job->queue_type == QUEUE_TYPE_HW)
 		cs_put(cs);
 
-	kfree(job);
-}
-
-static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
-{
-	hdev->aggregated_cs_counters.device_in_reset_drop_cnt +=
-			ctx->cs_counters.device_in_reset_drop_cnt;
-	hdev->aggregated_cs_counters.out_of_mem_drop_cnt +=
-			ctx->cs_counters.out_of_mem_drop_cnt;
-	hdev->aggregated_cs_counters.parsing_drop_cnt +=
-			ctx->cs_counters.parsing_drop_cnt;
-	hdev->aggregated_cs_counters.queue_full_drop_cnt +=
-			ctx->cs_counters.queue_full_drop_cnt;
-	hdev->aggregated_cs_counters.max_cs_in_flight_drop_cnt +=
-			ctx->cs_counters.max_cs_in_flight_drop_cnt;
+	cs_job_put(job);
 }
 
 static void cs_do_release(struct kref *ref)
 {
-	struct hl_cs *cs = container_of(ref, struct hl_cs,
-						refcount);
+	struct hl_cs *cs = container_of(ref, struct hl_cs, refcount);
 	struct hl_device *hdev = cs->ctx->hdev;
 	struct hl_cs_job *job, *tmp;
 
@@ -268,77 +308,78 @@ static void cs_do_release(struct kref *ref)
 	/*
 	 * Although if we reached here it means that all external jobs have
 	 * finished, because each one of them took refcnt to CS, we still
-	 * need to go over the internal jobs and free them. Otherwise, we
+	 * need to go over the internal jobs and complete them. Otherwise, we
 	 * will have leaked memory and what's worse, the CS object (and
 	 * potentially the CTX object) could be released, while the JOB
 	 * still holds a pointer to them (but no reference).
 	 */
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-		free_job(hdev, job);
+		complete_job(hdev, job);
 
-	/* We also need to update CI for internal queues */
-	if (cs->submitted) {
-		hdev->asic_funcs->hw_queues_lock(hdev);
+	if (!cs->submitted) {
+		/* In case the wait for signal CS was submitted, the put occurs
+		 * in init_signal_wait_cs() or collective_wait_init_cs()
+		 * right before hanging on the PQ.
+		 */
+		if (cs->type == CS_TYPE_WAIT ||
+				cs->type == CS_TYPE_COLLECTIVE_WAIT)
+			hl_fence_put(cs->signal_fence);
 
-		hdev->cs_active_cnt--;
-		if (!hdev->cs_active_cnt) {
-			struct hl_device_idle_busy_ts *ts;
+		goto out;
+	}
 
-			ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx++];
-			ts->busy_to_idle_ts = ktime_get();
+	hdev->asic_funcs->hw_queues_lock(hdev);
 
-			if (hdev->idle_busy_ts_idx == HL_IDLE_BUSY_TS_ARR_SIZE)
-				hdev->idle_busy_ts_idx = 0;
-		} else if (hdev->cs_active_cnt < 0) {
-			dev_crit(hdev->dev, "CS active cnt %d is negative\n",
-				hdev->cs_active_cnt);
-		}
+	hdev->cs_active_cnt--;
+	if (!hdev->cs_active_cnt) {
+		struct hl_device_idle_busy_ts *ts;
 
-		hdev->asic_funcs->hw_queues_unlock(hdev);
+		ts = &hdev->idle_busy_ts_arr[hdev->idle_busy_ts_idx++];
+		ts->busy_to_idle_ts = ktime_get();
 
-		hl_int_hw_queue_update_ci(cs);
+		if (hdev->idle_busy_ts_idx == HL_IDLE_BUSY_TS_ARR_SIZE)
+			hdev->idle_busy_ts_idx = 0;
+	} else if (hdev->cs_active_cnt < 0) {
+		dev_crit(hdev->dev, "CS active cnt %d is negative\n",
+			hdev->cs_active_cnt);
+	}
 
-		spin_lock(&hdev->hw_queues_mirror_lock);
-		/* remove CS from hw_queues mirror list */
-		list_del_init(&cs->mirror_node);
-		spin_unlock(&hdev->hw_queues_mirror_lock);
+	hdev->asic_funcs->hw_queues_unlock(hdev);
 
-		/*
-		 * Don't cancel TDR in case this CS was timedout because we
-		 * might be running from the TDR context
-		 */
-		if ((!cs->timedout) &&
-			(hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT)) {
-			struct hl_cs *next;
+	/* Need to update CI for internal queues */
+	hl_int_hw_queue_update_ci(cs);
 
-			if (cs->tdr_active)
-				cancel_delayed_work_sync(&cs->work_tdr);
+	/* remove CS from CS mirror list */
+	spin_lock(&hdev->cs_mirror_lock);
+	list_del_init(&cs->mirror_node);
+	spin_unlock(&hdev->cs_mirror_lock);
 
-			spin_lock(&hdev->hw_queues_mirror_lock);
+	/* Don't cancel TDR in case this CS was timedout because we might be
+	 * running from the TDR context
+	 */
+	if (!cs->timedout && hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) {
+		struct hl_cs *next;
 
-			/* queue TDR for next CS */
-			next = list_first_entry_or_null(
-					&hdev->hw_queues_mirror_list,
-					struct hl_cs, mirror_node);
+		if (cs->tdr_active)
+			cancel_delayed_work_sync(&cs->work_tdr);
 
-			if ((next) && (!next->tdr_active)) {
-				next->tdr_active = true;
-				schedule_delayed_work(&next->work_tdr,
-							hdev->timeout_jiffies);
-			}
+		spin_lock(&hdev->cs_mirror_lock);
 
-			spin_unlock(&hdev->hw_queues_mirror_lock);
+		/* queue TDR for next CS */
+		next = list_first_entry_or_null(&hdev->cs_mirror_list,
+						struct hl_cs, mirror_node);
+
+		if (next && !next->tdr_active) {
+			next->tdr_active = true;
+			schedule_delayed_work(&next->work_tdr,
+						hdev->timeout_jiffies);
 		}
-	} else if (cs->type == CS_TYPE_WAIT) {
-		/*
-		 * In case the wait for signal CS was submitted, the put occurs
-		 * in init_signal_wait_cs() right before hanging on the PQ.
-		 */
-		hl_fence_put(cs->signal_fence);
+
+		spin_unlock(&hdev->cs_mirror_lock);
 	}
 
-	/*
-	 * Must be called before hl_ctx_put because inside we use ctx to get
+out:
+	/* Must be called before hl_ctx_put because inside we use ctx to get
 	 * the device
 	 */
 	hl_debugfs_remove_cs(cs);
@@ -356,9 +397,10 @@ static void cs_do_release(struct kref *ref)
 	else if (!cs->submitted)
 		cs->fence->error = -EBUSY;
 
+	if (cs->timestamp)
+		cs->fence->timestamp = ktime_get();
 	complete_all(&cs->fence->completion);
 	hl_fence_put(cs->fence);
-	cs_counters_aggregate(hdev, cs->ctx);
 
 	kfree(cs->jobs_in_queue_cnt);
 	kfree(cs);
@@ -384,24 +426,51 @@ static void cs_timedout(struct work_struct *work)
 
 	hdev = cs->ctx->hdev;
 
-	dev_err(hdev->dev,
-		"Command submission %llu has not finished in time!\n",
-		cs->sequence);
+	switch (cs->type) {
+	case CS_TYPE_SIGNAL:
+		dev_err(hdev->dev,
+			"Signal command submission %llu has not finished in time!\n",
+			cs->sequence);
+		break;
+
+	case CS_TYPE_WAIT:
+		dev_err(hdev->dev,
+			"Wait command submission %llu has not finished in time!\n",
+			cs->sequence);
+		break;
+
+	case CS_TYPE_COLLECTIVE_WAIT:
+		dev_err(hdev->dev,
+			"Collective Wait command submission %llu has not finished in time!\n",
+			cs->sequence);
+		break;
+
+	default:
+		dev_err(hdev->dev,
+			"Command submission %llu has not finished in time!\n",
+			cs->sequence);
+		break;
+	}
 
 	cs_put(cs);
 
 	if (hdev->reset_on_lockup)
 		hl_device_reset(hdev, false, false);
+	else
+		hdev->needs_reset = true;
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 			enum hl_cs_type cs_type, struct hl_cs **cs_new)
 {
-	struct hl_cs_compl *cs_cmpl;
+	struct hl_cs_counters_atomic *cntr;
 	struct hl_fence *other = NULL;
+	struct hl_cs_compl *cs_cmpl;
 	struct hl_cs *cs;
 	int rc;
 
+	cntr = &hdev->aggregated_cs_counters;
+
 	cs = kzalloc(sizeof(*cs), GFP_ATOMIC);
 	if (!cs)
 		return -ENOMEM;
@@ -435,7 +504,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	if (other && !completion_done(&other->completion)) {
 		dev_dbg_ratelimited(hdev->dev,
 			"Rejecting CS because of too many in-flights CS\n");
-		ctx->cs_counters.max_cs_in_flight_drop_cnt++;
+		atomic64_inc(&ctx->cs_counters.max_cs_in_flight_drop_cnt);
+		atomic64_inc(&cntr->max_cs_in_flight_drop_cnt);
 		rc = -EAGAIN;
 		goto free_fence;
 	}
@@ -480,7 +550,7 @@ static void cs_rollback(struct hl_device *hdev, struct hl_cs *cs)
 	struct hl_cs_job *job, *tmp;
 
 	list_for_each_entry_safe(job, tmp, &cs->job_list, cs_node)
-		free_job(hdev, job);
+		complete_job(hdev, job);
 }
 
 void hl_cs_rollback_all(struct hl_device *hdev)
@@ -493,8 +563,7 @@ void hl_cs_rollback_all(struct hl_device *hdev)
 		flush_workqueue(hdev->cq_wq[i]);
 
 	/* Make sure we don't have leftovers in the H/W queues mirror list */
-	list_for_each_entry_safe(cs, tmp, &hdev->hw_queues_mirror_list,
-				mirror_node) {
+	list_for_each_entry_safe(cs, tmp, &hdev->cs_mirror_list, mirror_node) {
 		cs_get(cs);
 		cs->aborted = true;
 		dev_warn_ratelimited(hdev->dev, "Killing CS %d.%llu\n",
@@ -512,7 +581,7 @@ static void job_wq_completion(struct work_struct *work)
 	struct hl_device *hdev = cs->ctx->hdev;
 
 	/* job is no longer needed */
-	free_job(hdev, job);
+	complete_job(hdev, job);
 }
 
 static int validate_queue_index(struct hl_device *hdev,
@@ -547,9 +616,36 @@ static int validate_queue_index(struct hl_device *hdev,
 		return -EINVAL;
 	}
 
-	*queue_type = hw_queue_prop->type;
-	*is_kernel_allocated_cb = !!hw_queue_prop->requires_kernel_cb;
+	/* When hw queue type isn't QUEUE_TYPE_HW,
+	 * USER_ALLOC_CB flag shall be referred as "don't care".
+	 */
+	if (hw_queue_prop->type == QUEUE_TYPE_HW) {
+		if (chunk->cs_chunk_flags & HL_CS_CHUNK_FLAGS_USER_ALLOC_CB) {
+			if (!(hw_queue_prop->cb_alloc_flags & CB_ALLOC_USER)) {
+				dev_err(hdev->dev,
+					"Queue index %d doesn't support user CB\n",
+					chunk->queue_index);
+				return -EINVAL;
+			}
+
+			*is_kernel_allocated_cb = false;
+		} else {
+			if (!(hw_queue_prop->cb_alloc_flags &
+					CB_ALLOC_KERNEL)) {
+				dev_err(hdev->dev,
+					"Queue index %d doesn't support kernel CB\n",
+					chunk->queue_index);
+				return -EINVAL;
+			}
+
+			*is_kernel_allocated_cb = true;
+		}
+	} else {
+		*is_kernel_allocated_cb = !!(hw_queue_prop->cb_alloc_flags
+						& CB_ALLOC_KERNEL);
+	}
 
+	*queue_type = hw_queue_prop->type;
 	return 0;
 }
 
@@ -573,9 +669,7 @@ static struct hl_cb *get_cb_from_cs_chunk(struct hl_device *hdev,
 		goto release_cb;
 	}
 
-	spin_lock(&cb->lock);
-	cb->cs_cnt++;
-	spin_unlock(&cb->lock);
+	atomic_inc(&cb->cs_cnt);
 
 	return cb;
 
@@ -593,6 +687,7 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 	if (!job)
 		return NULL;
 
+	kref_init(&job->refcount);
 	job->queue_type = queue_type;
 	job->is_kernel_allocated_cb = is_kernel_allocated_cb;
 
@@ -605,42 +700,115 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 	return job;
 }
 
-static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
-				u32 num_chunks, u64 *cs_seq)
+static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags)
+{
+	if (cs_type_flags & HL_CS_FLAGS_SIGNAL)
+		return CS_TYPE_SIGNAL;
+	else if (cs_type_flags & HL_CS_FLAGS_WAIT)
+		return CS_TYPE_WAIT;
+	else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT)
+		return CS_TYPE_COLLECTIVE_WAIT;
+	else
+		return CS_TYPE_DEFAULT;
+}
+
+static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
-	struct hl_cs_chunk *cs_chunk_array;
-	struct hl_cs_job *job;
-	struct hl_cs *cs;
-	struct hl_cb *cb;
-	bool int_queues_only = true;
-	u32 size_to_copy;
-	int rc, i;
+	struct hl_ctx *ctx = hpriv->ctx;
+	u32 cs_type_flags, num_chunks;
+	enum hl_device_status status;
+	enum hl_cs_type cs_type;
 
-	*cs_seq = ULLONG_MAX;
+	if (!hl_device_operational(hdev, &status)) {
+		dev_warn_ratelimited(hdev->dev,
+			"Device is %s. Can't submit new CS\n",
+			hdev->status[status]);
+		return -EBUSY;
+	}
+
+	cs_type_flags = args->in.cs_flags & HL_CS_FLAGS_TYPE_MASK;
+
+	if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) {
+		dev_err(hdev->dev,
+			"CS type flags are mutually exclusive, context %d\n",
+			ctx->asid);
+		return -EINVAL;
+	}
+
+	cs_type = hl_cs_get_cs_type(cs_type_flags);
+	num_chunks = args->in.num_chunks_execute;
+
+	if (unlikely((cs_type != CS_TYPE_DEFAULT) &&
+					!hdev->supports_sync_stream)) {
+		dev_err(hdev->dev, "Sync stream CS is not supported\n");
+		return -EINVAL;
+	}
+
+	if (cs_type == CS_TYPE_DEFAULT) {
+		if (!num_chunks) {
+			dev_err(hdev->dev,
+				"Got execute CS with 0 chunks, context %d\n",
+				ctx->asid);
+			return -EINVAL;
+		}
+	} else if (num_chunks != 1) {
+		dev_err(hdev->dev,
+			"Sync stream CS mandates one chunk only, context %d\n",
+			ctx->asid);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int hl_cs_copy_chunk_array(struct hl_device *hdev,
+					struct hl_cs_chunk **cs_chunk_array,
+					void __user *chunks, u32 num_chunks)
+{
+	u32 size_to_copy;
 
 	if (num_chunks > HL_MAX_JOBS_PER_CS) {
 		dev_err(hdev->dev,
 			"Number of chunks can NOT be larger than %d\n",
 			HL_MAX_JOBS_PER_CS);
-		rc = -EINVAL;
-		goto out;
+		return -EINVAL;
 	}
 
-	cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+	*cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array),
 					GFP_ATOMIC);
-	if (!cs_chunk_array) {
-		rc = -ENOMEM;
-		goto out;
-	}
+	if (!*cs_chunk_array)
+		return -ENOMEM;
 
 	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
-	if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
+	if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) {
 		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
-		rc = -EFAULT;
-		goto free_cs_chunk_array;
+		kfree(*cs_chunk_array);
+		return -EFAULT;
 	}
 
+	return 0;
+}
+
+static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
+				u32 num_chunks, u64 *cs_seq, bool timestamp)
+{
+	bool int_queues_only = true;
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_chunk *cs_chunk_array;
+	struct hl_cs_counters_atomic *cntr;
+	struct hl_cs_job *job;
+	struct hl_cs *cs;
+	struct hl_cb *cb;
+	int rc, i;
+
+	cntr = &hdev->aggregated_cs_counters;
+	*cs_seq = ULLONG_MAX;
+
+	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks);
+	if (rc)
+		goto out;
+
 	/* increment refcnt for context */
 	hl_ctx_get(hdev, hpriv->ctx);
 
@@ -650,6 +818,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		goto free_cs_chunk_array;
 	}
 
+	cs->timestamp = !!timestamp;
 	*cs_seq = cs->sequence;
 
 	hl_debugfs_add_cs(cs);
@@ -663,14 +832,17 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		rc = validate_queue_index(hdev, chunk, &queue_type,
 						&is_kernel_allocated_cb);
 		if (rc) {
-			hpriv->ctx->cs_counters.parsing_drop_cnt++;
+			atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt);
+			atomic64_inc(&cntr->parsing_drop_cnt);
 			goto free_cs_object;
 		}
 
 		if (is_kernel_allocated_cb) {
 			cb = get_cb_from_cs_chunk(hdev, &hpriv->cb_mgr, chunk);
 			if (!cb) {
-				hpriv->ctx->cs_counters.parsing_drop_cnt++;
+				atomic64_inc(
+				&hpriv->ctx->cs_counters.parsing_drop_cnt);
+				atomic64_inc(&cntr->parsing_drop_cnt);
 				rc = -EINVAL;
 				goto free_cs_object;
 			}
@@ -684,7 +856,9 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		job = hl_cs_allocate_job(hdev, queue_type,
 						is_kernel_allocated_cb);
 		if (!job) {
-			hpriv->ctx->cs_counters.out_of_mem_drop_cnt++;
+			atomic64_inc(
+			&hpriv->ctx->cs_counters.out_of_mem_drop_cnt);
+			atomic64_inc(&cntr->out_of_mem_drop_cnt);
 			dev_err(hdev->dev, "Failed to allocate a new job\n");
 			rc = -ENOMEM;
 			if (is_kernel_allocated_cb)
@@ -717,7 +891,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 
 		rc = cs_parser(hpriv, job);
 		if (rc) {
-			hpriv->ctx->cs_counters.parsing_drop_cnt++;
+			atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt);
+			atomic64_inc(&cntr->parsing_drop_cnt);
 			dev_err(hdev->dev,
 				"Failed to parse JOB %d.%llu.%d, err %d, rejecting the CS\n",
 				cs->ctx->asid, cs->sequence, job->id, rc);
@@ -726,7 +901,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 	}
 
 	if (int_queues_only) {
-		hpriv->ctx->cs_counters.parsing_drop_cnt++;
+		atomic64_inc(&hpriv->ctx->cs_counters.parsing_drop_cnt);
+		atomic64_inc(&cntr->parsing_drop_cnt);
 		dev_err(hdev->dev,
 			"Reject CS %d.%llu because only internal queues jobs are present\n",
 			cs->ctx->asid, cs->sequence);
@@ -747,9 +923,7 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 	goto put_cs;
 
 release_cb:
-	spin_lock(&cb->lock);
-	cb->cs_cnt--;
-	spin_unlock(&cb->lock);
+	atomic_dec(&cb->cs_cnt);
 	hl_cb_put(cb);
 free_cs_object:
 	cs_rollback(hdev, cs);
@@ -764,47 +938,234 @@ out:
 	return rc;
 }
 
-static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
-				void __user *chunks, u32 num_chunks,
+static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 				u64 *cs_seq)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
-	struct hl_cs_chunk *cs_chunk_array, *chunk;
-	struct hw_queue_properties *hw_queue_prop;
-	struct hl_fence *sig_fence = NULL;
-	struct hl_cs_job *job;
-	struct hl_cs *cs;
-	struct hl_cb *cb;
-	enum hl_queue_type q_type;
-	u64 *signal_seq_arr = NULL, signal_seq;
-	u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size;
-	int rc;
+	bool need_soft_reset = false;
+	int rc = 0, do_ctx_switch;
+	void __user *chunks;
+	u32 num_chunks, tmp;
+	int ret;
 
-	*cs_seq = ULLONG_MAX;
+	do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
 
-	if (num_chunks > HL_MAX_JOBS_PER_CS) {
+	if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
+		mutex_lock(&hpriv->restore_phase_mutex);
+
+		if (do_ctx_switch) {
+			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
+			if (rc) {
+				dev_err_ratelimited(hdev->dev,
+					"Failed to switch to context %d, rejecting CS! %d\n",
+					ctx->asid, rc);
+				/*
+				 * If we timedout, or if the device is not IDLE
+				 * while we want to do context-switch (-EBUSY),
+				 * we need to soft-reset because QMAN is
+				 * probably stuck. However, we can't call to
+				 * reset here directly because of deadlock, so
+				 * need to do it at the very end of this
+				 * function
+				 */
+				if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
+					need_soft_reset = true;
+				mutex_unlock(&hpriv->restore_phase_mutex);
+				goto out;
+			}
+		}
+
+		hdev->asic_funcs->restore_phase_topology(hdev);
+
+		chunks = (void __user *) (uintptr_t) args->in.chunks_restore;
+		num_chunks = args->in.num_chunks_restore;
+
+		if (!num_chunks) {
+			dev_dbg(hdev->dev,
+				"Need to run restore phase but restore CS is empty\n");
+			rc = 0;
+		} else {
+			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
+						cs_seq, false);
+		}
+
+		mutex_unlock(&hpriv->restore_phase_mutex);
+
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to submit restore CS for context %d (%d)\n",
+				ctx->asid, rc);
+			goto out;
+		}
+
+		/* Need to wait for restore completion before execution phase */
+		if (num_chunks) {
+			enum hl_cs_wait_status status;
+wait_again:
+			ret = _hl_cs_wait_ioctl(hdev, ctx,
+					jiffies_to_usecs(hdev->timeout_jiffies),
+					*cs_seq, &status, NULL);
+			if (ret) {
+				if (ret == -ERESTARTSYS) {
+					usleep_range(100, 200);
+					goto wait_again;
+				}
+
+				dev_err(hdev->dev,
+					"Restore CS for context %d failed to complete %d\n",
+					ctx->asid, ret);
+				rc = -ENOEXEC;
+				goto out;
+			}
+		}
+
+		ctx->thread_ctx_switch_wait_token = 1;
+
+	} else if (!ctx->thread_ctx_switch_wait_token) {
+		rc = hl_poll_timeout_memory(hdev,
+			&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
+			100, jiffies_to_usecs(hdev->timeout_jiffies), false);
+
+		if (rc == -ETIMEDOUT) {
+			dev_err(hdev->dev,
+				"context switch phase timeout (%d)\n", tmp);
+			goto out;
+		}
+	}
+
+out:
+	if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset))
+		hl_device_reset(hdev, false, false);
+
+	return rc;
+}
+
+static int cs_ioctl_extract_signal_seq(struct hl_device *hdev,
+		struct hl_cs_chunk *chunk, u64 *signal_seq)
+{
+	u64 *signal_seq_arr = NULL;
+	u32 size_to_copy, signal_seq_arr_len;
+	int rc = 0;
+
+	signal_seq_arr_len = chunk->num_signal_seq_arr;
+
+	/* currently only one signal seq is supported */
+	if (signal_seq_arr_len != 1) {
 		dev_err(hdev->dev,
-			"Number of chunks can NOT be larger than %d\n",
-			HL_MAX_JOBS_PER_CS);
-		rc = -EINVAL;
-		goto out;
+			"Wait for signal CS supports only one signal CS seq\n");
+		return -EINVAL;
 	}
 
-	cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+	signal_seq_arr = kmalloc_array(signal_seq_arr_len,
+					sizeof(*signal_seq_arr),
 					GFP_ATOMIC);
-	if (!cs_chunk_array) {
-		rc = -ENOMEM;
+	if (!signal_seq_arr)
+		return -ENOMEM;
+
+	size_to_copy = chunk->num_signal_seq_arr * sizeof(*signal_seq_arr);
+	if (copy_from_user(signal_seq_arr,
+				u64_to_user_ptr(chunk->signal_seq_arr),
+				size_to_copy)) {
+		dev_err(hdev->dev,
+			"Failed to copy signal seq array from user\n");
+		rc = -EFAULT;
 		goto out;
 	}
 
-	size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
-	if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
-		dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
-		rc = -EFAULT;
-		goto free_cs_chunk_array;
+	/* currently it is guaranteed to have only one signal seq */
+	*signal_seq = signal_seq_arr[0];
+
+out:
+	kfree(signal_seq_arr);
+
+	return rc;
+}
+
+static int cs_ioctl_signal_wait_create_jobs(struct hl_device *hdev,
+		struct hl_ctx *ctx, struct hl_cs *cs, enum hl_queue_type q_type,
+		u32 q_idx)
+{
+	struct hl_cs_counters_atomic *cntr;
+	struct hl_cs_job *job;
+	struct hl_cb *cb;
+	u32 cb_size;
+
+	cntr = &hdev->aggregated_cs_counters;
+
+	job = hl_cs_allocate_job(hdev, q_type, true);
+	if (!job) {
+		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
+		atomic64_inc(&cntr->out_of_mem_drop_cnt);
+		dev_err(hdev->dev, "Failed to allocate a new job\n");
+		return -ENOMEM;
 	}
 
+	if (cs->type == CS_TYPE_WAIT)
+		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
+	else
+		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
+
+	cb = hl_cb_kernel_create(hdev, cb_size,
+				q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
+	if (!cb) {
+		atomic64_inc(&ctx->cs_counters.out_of_mem_drop_cnt);
+		atomic64_inc(&cntr->out_of_mem_drop_cnt);
+		kfree(job);
+		return -EFAULT;
+	}
+
+	job->id = 0;
+	job->cs = cs;
+	job->user_cb = cb;
+	atomic_inc(&job->user_cb->cs_cnt);
+	job->user_cb_size = cb_size;
+	job->hw_queue_id = q_idx;
+
+	/*
+	 * No need in parsing, user CB is the patched CB.
+	 * We call hl_cb_destroy() out of two reasons - we don't need the CB in
+	 * the CB idr anymore and to decrement its refcount as it was
+	 * incremented inside hl_cb_kernel_create().
+	 */
+	job->patched_cb = job->user_cb;
+	job->job_cb_size = job->user_cb_size;
+	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+	/* increment refcount as for external queues we get completion */
+	cs_get(cs);
+
+	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+	list_add_tail(&job->cs_node, &cs->job_list);
+
+	hl_debugfs_add_job(hdev, job);
+
+	return 0;
+}
+
+static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
+				void __user *chunks, u32 num_chunks,
+				u64 *cs_seq, bool timestamp)
+{
+	struct hl_cs_chunk *cs_chunk_array, *chunk;
+	struct hw_queue_properties *hw_queue_prop;
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_cs_compl *sig_waitcs_cmpl;
+	u32 q_idx, collective_engine_id = 0;
+	struct hl_fence *sig_fence = NULL;
+	struct hl_ctx *ctx = hpriv->ctx;
+	enum hl_queue_type q_type;
+	struct hl_cs *cs;
+	u64 signal_seq;
+	int rc;
+
+	*cs_seq = ULLONG_MAX;
+
+	rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks);
+	if (rc)
+		goto out;
+
 	/* currently it is guaranteed to have only one chunk */
 	chunk = &cs_chunk_array[0];
 
@@ -819,60 +1180,43 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
 	q_type = hw_queue_prop->type;
 
-	if ((q_idx >= hdev->asic_prop.max_queues) ||
-			(!hw_queue_prop->supports_sync_stream)) {
-		dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
+	if (!hw_queue_prop->supports_sync_stream) {
+		dev_err(hdev->dev,
+			"Queue index %d does not support sync stream operations\n",
+			q_idx);
 		rc = -EINVAL;
 		goto free_cs_chunk_array;
 	}
 
-	if (cs_type == CS_TYPE_WAIT) {
-		struct hl_cs_compl *sig_waitcs_cmpl;
-
-		signal_seq_arr_len = chunk->num_signal_seq_arr;
-
-		/* currently only one signal seq is supported */
-		if (signal_seq_arr_len != 1) {
+	if (cs_type == CS_TYPE_COLLECTIVE_WAIT) {
+		if (!(hw_queue_prop->collective_mode == HL_COLLECTIVE_MASTER)) {
 			dev_err(hdev->dev,
-				"Wait for signal CS supports only one signal CS seq\n");
+				"Queue index %d is invalid\n", q_idx);
 			rc = -EINVAL;
 			goto free_cs_chunk_array;
 		}
 
-		signal_seq_arr = kmalloc_array(signal_seq_arr_len,
-						sizeof(*signal_seq_arr),
-						GFP_ATOMIC);
-		if (!signal_seq_arr) {
-			rc = -ENOMEM;
-			goto free_cs_chunk_array;
-		}
+		collective_engine_id = chunk->collective_engine_id;
+	}
 
-		size_to_copy = chunk->num_signal_seq_arr *
-				sizeof(*signal_seq_arr);
-		if (copy_from_user(signal_seq_arr,
-					u64_to_user_ptr(chunk->signal_seq_arr),
-					size_to_copy)) {
-			dev_err(hdev->dev,
-				"Failed to copy signal seq array from user\n");
-			rc = -EFAULT;
-			goto free_signal_seq_array;
-		}
+	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT) {
+		rc = cs_ioctl_extract_signal_seq(hdev, chunk, &signal_seq);
+		if (rc)
+			goto free_cs_chunk_array;
 
-		/* currently it is guaranteed to have only one signal seq */
-		signal_seq = signal_seq_arr[0];
 		sig_fence = hl_ctx_get_fence(ctx, signal_seq);
 		if (IS_ERR(sig_fence)) {
 			dev_err(hdev->dev,
 				"Failed to get signal CS with seq 0x%llx\n",
 				signal_seq);
 			rc = PTR_ERR(sig_fence);
-			goto free_signal_seq_array;
+			goto free_cs_chunk_array;
 		}
 
 		if (!sig_fence) {
 			/* signal CS already finished */
 			rc = 0;
-			goto free_signal_seq_array;
+			goto free_cs_chunk_array;
 		}
 
 		sig_waitcs_cmpl =
@@ -884,14 +1228,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 				signal_seq);
 			hl_fence_put(sig_fence);
 			rc = -EINVAL;
-			goto free_signal_seq_array;
+			goto free_cs_chunk_array;
 		}
 
 		if (completion_done(&sig_fence->completion)) {
 			/* signal CS already finished */
 			hl_fence_put(sig_fence);
 			rc = 0;
-			goto free_signal_seq_array;
+			goto free_cs_chunk_array;
 		}
 	}
 
@@ -900,70 +1244,37 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
 	rc = allocate_cs(hdev, ctx, cs_type, &cs);
 	if (rc) {
-		if (cs_type == CS_TYPE_WAIT)
+		if (cs_type == CS_TYPE_WAIT ||
+			cs_type == CS_TYPE_COLLECTIVE_WAIT)
 			hl_fence_put(sig_fence);
 		hl_ctx_put(ctx);
-		goto free_signal_seq_array;
+		goto free_cs_chunk_array;
 	}
 
+	cs->timestamp = !!timestamp;
+
 	/*
 	 * Save the signal CS fence for later initialization right before
 	 * hanging the wait CS on the queue.
 	 */
-	if (cs->type == CS_TYPE_WAIT)
+	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_COLLECTIVE_WAIT)
 		cs->signal_fence = sig_fence;
 
 	hl_debugfs_add_cs(cs);
 
 	*cs_seq = cs->sequence;
 
-	job = hl_cs_allocate_job(hdev, q_type, true);
-	if (!job) {
-		ctx->cs_counters.out_of_mem_drop_cnt++;
-		dev_err(hdev->dev, "Failed to allocate a new job\n");
-		rc = -ENOMEM;
-		goto put_cs;
-	}
-
-	if (cs->type == CS_TYPE_WAIT)
-		cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
+	if (cs_type == CS_TYPE_WAIT || cs_type == CS_TYPE_SIGNAL)
+		rc = cs_ioctl_signal_wait_create_jobs(hdev, ctx, cs, q_type,
+				q_idx);
+	else if (cs_type == CS_TYPE_COLLECTIVE_WAIT)
+		rc = hdev->asic_funcs->collective_wait_create_jobs(hdev, ctx,
+				cs, q_idx, collective_engine_id);
 	else
-		cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
-
-	cb = hl_cb_kernel_create(hdev, cb_size,
-				q_type == QUEUE_TYPE_HW && hdev->mmu_enable);
-	if (!cb) {
-		ctx->cs_counters.out_of_mem_drop_cnt++;
-		kfree(job);
-		rc = -EFAULT;
-		goto put_cs;
-	}
-
-	job->id = 0;
-	job->cs = cs;
-	job->user_cb = cb;
-	job->user_cb->cs_cnt++;
-	job->user_cb_size = cb_size;
-	job->hw_queue_id = q_idx;
-
-	/*
-	 * No need in parsing, user CB is the patched CB.
-	 * We call hl_cb_destroy() out of two reasons - we don't need the CB in
-	 * the CB idr anymore and to decrement its refcount as it was
-	 * incremented inside hl_cb_kernel_create().
-	 */
-	job->patched_cb = job->user_cb;
-	job->job_cb_size = job->user_cb_size;
-	hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
-
-	cs->jobs_in_queue_cnt[job->hw_queue_id]++;
-
-	list_add_tail(&job->cs_node, &cs->job_list);
-
-	/* increment refcount as for external queues we get completion */
-	cs_get(cs);
+		rc = -EINVAL;
 
-	hl_debugfs_add_job(hdev, job);
+	if (rc)
+		goto free_cs_object;
 
 	rc = hl_hw_queue_schedule_cs(cs);
 	if (rc) {
@@ -984,9 +1295,6 @@ free_cs_object:
 put_cs:
 	/* We finished with the CS in this function, so put the ref */
 	cs_put(cs);
-free_signal_seq_array:
-	if (cs_type == CS_TYPE_WAIT)
-		kfree(signal_seq_arr);
 free_cs_chunk_array:
 	kfree(cs_chunk_array);
 out:
@@ -995,156 +1303,39 @@ out:
 
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
-	struct hl_device *hdev = hpriv->hdev;
 	union hl_cs_args *args = data;
-	struct hl_ctx *ctx = hpriv->ctx;
-	void __user *chunks_execute, *chunks_restore;
 	enum hl_cs_type cs_type;
-	u32 num_chunks_execute, num_chunks_restore, sig_wait_flags;
 	u64 cs_seq = ULONG_MAX;
-	int rc, do_ctx_switch;
-	bool need_soft_reset = false;
-
-	if (hl_device_disabled_or_in_reset(hdev)) {
-		dev_warn_ratelimited(hdev->dev,
-			"Device is %s. Can't submit new CS\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
-		rc = -EBUSY;
-		goto out;
-	}
-
-	sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT;
+	void __user *chunks;
+	u32 num_chunks;
+	int rc;
 
-	if (unlikely(sig_wait_flags == HL_CS_FLAGS_SIG_WAIT)) {
-		dev_err(hdev->dev,
-			"Signal and wait CS flags are mutually exclusive, context %d\n",
-		ctx->asid);
-		rc = -EINVAL;
+	rc = hl_cs_sanity_checks(hpriv, args);
+	if (rc)
 		goto out;
-	}
 
-	if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) &&
-			(!hdev->supports_sync_stream))) {
-		dev_err(hdev->dev, "Sync stream CS is not supported\n");
-		rc = -EINVAL;
+	rc = hl_cs_ctx_switch(hpriv, args, &cs_seq);
+	if (rc)
 		goto out;
-	}
 
-	if (args->in.cs_flags & HL_CS_FLAGS_SIGNAL)
-		cs_type = CS_TYPE_SIGNAL;
-	else if (args->in.cs_flags & HL_CS_FLAGS_WAIT)
-		cs_type = CS_TYPE_WAIT;
-	else
-		cs_type = CS_TYPE_DEFAULT;
-
-	chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute;
-	num_chunks_execute = args->in.num_chunks_execute;
-
-	if (cs_type == CS_TYPE_DEFAULT) {
-		if (!num_chunks_execute) {
-			dev_err(hdev->dev,
-				"Got execute CS with 0 chunks, context %d\n",
-				ctx->asid);
-			rc = -EINVAL;
-			goto out;
-		}
-	} else if (num_chunks_execute != 1) {
-		dev_err(hdev->dev,
-			"Sync stream CS mandates one chunk only, context %d\n",
-			ctx->asid);
-		rc = -EINVAL;
-		goto out;
+	cs_type = hl_cs_get_cs_type(args->in.cs_flags &
+					~HL_CS_FLAGS_FORCE_RESTORE);
+	chunks = (void __user *) (uintptr_t) args->in.chunks_execute;
+	num_chunks = args->in.num_chunks_execute;
+
+	switch (cs_type) {
+	case CS_TYPE_SIGNAL:
+	case CS_TYPE_WAIT:
+	case CS_TYPE_COLLECTIVE_WAIT:
+		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
+			&cs_seq, args->in.cs_flags & HL_CS_FLAGS_TIMESTAMP);
+		break;
+	default:
+		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
+				args->in.cs_flags & HL_CS_FLAGS_TIMESTAMP);
+		break;
 	}
 
-	do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
-
-	if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) {
-		long ret;
-
-		chunks_restore =
-			(void __user *) (uintptr_t) args->in.chunks_restore;
-		num_chunks_restore = args->in.num_chunks_restore;
-
-		mutex_lock(&hpriv->restore_phase_mutex);
-
-		if (do_ctx_switch) {
-			rc = hdev->asic_funcs->context_switch(hdev, ctx->asid);
-			if (rc) {
-				dev_err_ratelimited(hdev->dev,
-					"Failed to switch to context %d, rejecting CS! %d\n",
-					ctx->asid, rc);
-				/*
-				 * If we timedout, or if the device is not IDLE
-				 * while we want to do context-switch (-EBUSY),
-				 * we need to soft-reset because QMAN is
-				 * probably stuck. However, we can't call to
-				 * reset here directly because of deadlock, so
-				 * need to do it at the very end of this
-				 * function
-				 */
-				if ((rc == -ETIMEDOUT) || (rc == -EBUSY))
-					need_soft_reset = true;
-				mutex_unlock(&hpriv->restore_phase_mutex);
-				goto out;
-			}
-		}
-
-		hdev->asic_funcs->restore_phase_topology(hdev);
-
-		if (!num_chunks_restore) {
-			dev_dbg(hdev->dev,
-			"Need to run restore phase but restore CS is empty\n");
-			rc = 0;
-		} else {
-			rc = cs_ioctl_default(hpriv, chunks_restore,
-						num_chunks_restore, &cs_seq);
-		}
-
-		mutex_unlock(&hpriv->restore_phase_mutex);
-
-		if (rc) {
-			dev_err(hdev->dev,
-				"Failed to submit restore CS for context %d (%d)\n",
-				ctx->asid, rc);
-			goto out;
-		}
-
-		/* Need to wait for restore completion before execution phase */
-		if (num_chunks_restore) {
-			ret = _hl_cs_wait_ioctl(hdev, ctx,
-					jiffies_to_usecs(hdev->timeout_jiffies),
-					cs_seq);
-			if (ret <= 0) {
-				dev_err(hdev->dev,
-					"Restore CS for context %d failed to complete %ld\n",
-					ctx->asid, ret);
-				rc = -ENOEXEC;
-				goto out;
-			}
-		}
-
-		ctx->thread_ctx_switch_wait_token = 1;
-	} else if (!ctx->thread_ctx_switch_wait_token) {
-		u32 tmp;
-
-		rc = hl_poll_timeout_memory(hdev,
-			&ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1),
-			100, jiffies_to_usecs(hdev->timeout_jiffies), false);
-
-		if (rc == -ETIMEDOUT) {
-			dev_err(hdev->dev,
-				"context switch phase timeout (%d)\n", tmp);
-			goto out;
-		}
-	}
-
-	if (cs_type == CS_TYPE_DEFAULT)
-		rc = cs_ioctl_default(hpriv, chunks_execute, num_chunks_execute,
-					&cs_seq);
-	else
-		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks_execute,
-						num_chunks_execute, &cs_seq);
-
 out:
 	if (rc != -EAGAIN) {
 		memset(args, 0, sizeof(*args));
@@ -1152,18 +1343,20 @@ out:
 		args->out.seq = cs_seq;
 	}
 
-	if (((rc == -ETIMEDOUT) || (rc == -EBUSY)) && (need_soft_reset))
-		hl_device_reset(hdev, false, false);
-
 	return rc;
 }
 
-static long _hl_cs_wait_ioctl(struct hl_device *hdev,
-		struct hl_ctx *ctx, u64 timeout_us, u64 seq)
+static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
+				u64 timeout_us, u64 seq,
+				enum hl_cs_wait_status *status, s64 *timestamp)
 {
 	struct hl_fence *fence;
 	unsigned long timeout;
-	long rc;
+	int rc = 0;
+	long completion_rc;
+
+	if (timestamp)
+		*timestamp = 0;
 
 	if (timeout_us == MAX_SCHEDULE_TIMEOUT)
 		timeout = timeout_us;
@@ -1181,11 +1374,20 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 				seq, ctx->cs_sequence);
 	} else if (fence) {
 		if (!timeout_us)
-			rc = completion_done(&fence->completion);
+			completion_rc = completion_done(&fence->completion);
 		else
-			rc = wait_for_completion_interruptible_timeout(
+			completion_rc =
+				wait_for_completion_interruptible_timeout(
 					&fence->completion, timeout);
 
+		if (completion_rc > 0) {
+			*status = CS_WAIT_STATUS_COMPLETED;
+			if (timestamp)
+				*timestamp = ktime_to_ns(fence->timestamp);
+		} else {
+			*status = CS_WAIT_STATUS_BUSY;
+		}
+
 		if (fence->error == -ETIMEDOUT)
 			rc = -ETIMEDOUT;
 		else if (fence->error == -EIO)
@@ -1196,7 +1398,7 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 		dev_dbg(hdev->dev,
 			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
 			seq, ctx->cs_sequence);
-		rc = 1;
+		*status = CS_WAIT_STATUS_GONE;
 	}
 
 	hl_ctx_put(ctx);
@@ -1208,14 +1410,17 @@ int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	union hl_wait_cs_args *args = data;
+	enum hl_cs_wait_status status;
 	u64 seq = args->in.seq;
-	long rc;
+	s64 timestamp;
+	int rc;
 
-	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq);
+	rc = _hl_cs_wait_ioctl(hdev, hpriv->ctx, args->in.timeout_us, seq,
+				&status, &timestamp);
 
 	memset(args, 0, sizeof(*args));
 
-	if (rc < 0) {
+	if (rc) {
 		if (rc == -ERESTARTSYS) {
 			dev_err_ratelimited(hdev->dev,
 				"user process got signal while waiting for CS handle %llu\n",
@@ -1236,10 +1441,23 @@ int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 		return rc;
 	}
 
-	if (rc == 0)
-		args->out.status = HL_WAIT_CS_STATUS_BUSY;
-	else
+	if (timestamp) {
+		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
+		args->out.timestamp_nsec = timestamp;
+	}
+
+	switch (status) {
+	case CS_WAIT_STATUS_GONE:
+		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
+		fallthrough;
+	case CS_WAIT_STATUS_COMPLETED:
 		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
+		break;
+	case CS_WAIT_STATUS_BUSY:
+	default:
+		args->out.status = HL_WAIT_CS_STATUS_BUSY;
+		break;
+	}
 
 	return 0;
 }
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 7a59dd7c6450..f65e6559149b 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -40,10 +40,14 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
 			hl_device_set_debug_mode(hdev, false);
 
+		hdev->asic_funcs->ctx_fini(ctx);
 		hl_cb_va_pool_fini(ctx);
 		hl_vm_ctx_fini(ctx);
 		hl_asid_free(hdev, ctx->asid);
 
+		/* Scrub both SRAM and DRAM */
+		hdev->asic_funcs->scrub_device_mem(hdev, 0, 0);
+
 		if ((!hdev->pldm) && (hdev->pdev) &&
 				(!hdev->asic_funcs->is_device_idle(hdev,
 							&idle_mask, NULL)))
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 912ddfa360b1..cef716643979 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -22,9 +22,10 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 				u8 i2c_reg, long *val)
 {
 	struct cpucp_packet pkt;
+	u64 result;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -EBUSY;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -36,7 +37,9 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	pkt.i2c_reg = i2c_reg;
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, val);
+						0, &result);
+
+	*val = (long) result;
 
 	if (rc)
 		dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
@@ -50,7 +53,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	struct cpucp_packet pkt;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -EBUSY;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -76,7 +79,7 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
 	struct cpucp_packet pkt;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -113,7 +116,7 @@ static int command_buffers_show(struct seq_file *s, void *data)
 			"   %03llu        %d    0x%08x      %d          %d          %d\n",
 			cb->id, cb->ctx->asid, cb->size,
 			kref_read(&cb->refcount),
-			cb->mmap, cb->cs_cnt);
+			cb->mmap, atomic_read(&cb->cs_cnt));
 	}
 
 	spin_unlock(&dev_entry->cb_spinlock);
@@ -168,18 +171,19 @@ static int command_submission_jobs_show(struct seq_file *s, void *data)
 		if (first) {
 			first = false;
 			seq_puts(s, "\n");
-			seq_puts(s, " JOB ID   CS ID    CTX ASID   H/W Queue\n");
-			seq_puts(s, "---------------------------------------\n");
+			seq_puts(s, " JOB ID   CS ID    CTX ASID   JOB RefCnt   H/W Queue\n");
+			seq_puts(s, "----------------------------------------------------\n");
 		}
 		if (job->cs)
 			seq_printf(s,
-				"    %02d       %llu         %d         %d\n",
+				"   %02d      %llu        %d          %d           %d\n",
 				job->id, job->cs->sequence, job->cs->ctx->asid,
-				job->hw_queue_id);
+				kref_read(&job->refcount), job->hw_queue_id);
 		else
 			seq_printf(s,
-				"    %02d       0         %d         %d\n",
-				job->id, HL_KERNEL_ASID_ID, job->hw_queue_id);
+				"   %02d      0        %d          %d           %d\n",
+				job->id, HL_KERNEL_ASID_ID,
+				kref_read(&job->refcount), job->hw_queue_id);
 	}
 
 	spin_unlock(&dev_entry->cs_job_spinlock);
@@ -300,93 +304,15 @@ static int vm_show(struct seq_file *s, void *data)
 	return 0;
 }
 
-/* these inline functions are copied from mmu.c */
-static inline u64 get_hop0_addr(struct hl_ctx *ctx)
-{
-	return ctx->hdev->asic_prop.mmu_pgt_addr +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
-}
-
-static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-					u64 virt_addr, u64 mask, u64 shift)
-{
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & mask) >> shift);
-}
-
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_specs,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop0_mask,
-					mmu_specs->hop0_shift);
-}
-
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_specs,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop1_mask,
-					mmu_specs->hop1_shift);
-}
-
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_specs,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop2_mask,
-					mmu_specs->hop2_shift);
-}
-
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_specs,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop3_mask,
-					mmu_specs->hop3_shift);
-}
-
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_specs,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop4_mask,
-					mmu_specs->hop4_shift);
-}
-
-static inline u64 get_hop5_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_specs,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop5_mask,
-					mmu_specs->hop5_shift);
-}
-
-static inline u64 get_next_hop_addr(u64 curr_pte)
-{
-	if (curr_pte & PAGE_PRESENT_MASK)
-		return curr_pte & HOP_PHYS_ADDR_MASK;
-	else
-		return ULLONG_MAX;
-}
-
 static int mmu_show(struct seq_file *s, void *data)
 {
 	struct hl_debugfs_entry *entry = s->private;
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct hl_mmu_properties *mmu_prop;
 	struct hl_ctx *ctx;
-	bool is_dram_addr;
-
-	u64 hop0_addr = 0, hop0_pte_addr = 0, hop0_pte = 0,
-		hop1_addr = 0, hop1_pte_addr = 0, hop1_pte = 0,
-		hop2_addr = 0, hop2_pte_addr = 0, hop2_pte = 0,
-		hop3_addr = 0, hop3_pte_addr = 0, hop3_pte = 0,
-		hop4_addr = 0, hop4_pte_addr = 0, hop4_pte = 0,
-		hop5_addr = 0, hop5_pte_addr = 0, hop5_pte = 0,
-		virt_addr = dev_entry->mmu_addr;
+	struct hl_mmu_hop_info hops_info;
+	u64 virt_addr = dev_entry->mmu_addr;
+	int i;
 
 	if (!hdev->mmu_enable)
 		return 0;
@@ -401,132 +327,24 @@ static int mmu_show(struct seq_file *s, void *data)
 		return 0;
 	}
 
-	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
-						prop->dmmu.start_addr,
-						prop->dmmu.end_addr);
-
-	/* shifts and masks are the same in PMMU and HPMMU, use one of them */
-	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
-
-	mutex_lock(&ctx->mmu_lock);
-
-	/* the following lookup is copied from unmap() in mmu.c */
-
-	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
-	hop0_pte = hdev->asic_funcs->read_pte(hdev, hop0_pte_addr);
-	hop1_addr = get_next_hop_addr(hop0_pte);
-
-	if (hop1_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
-	hop1_pte = hdev->asic_funcs->read_pte(hdev, hop1_pte_addr);
-	hop2_addr = get_next_hop_addr(hop1_pte);
-
-	if (hop2_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
-	hop2_pte = hdev->asic_funcs->read_pte(hdev, hop2_pte_addr);
-	hop3_addr = get_next_hop_addr(hop2_pte);
-
-	if (hop3_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
-	hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
-
-	if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
-		if (!(hop3_pte & LAST_MASK)) {
-			hop4_addr = get_next_hop_addr(hop3_pte);
-
-			if (hop4_addr == ULLONG_MAX)
-				goto not_mapped;
-
-			hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
-							hop4_addr, virt_addr);
-			hop4_pte = hdev->asic_funcs->read_pte(hdev,
-								hop4_pte_addr);
-			if (!(hop4_pte & PAGE_PRESENT_MASK))
-				goto not_mapped;
-		} else {
-			if (!(hop3_pte & PAGE_PRESENT_MASK))
-				goto not_mapped;
-		}
-	} else {
-		hop4_addr = get_next_hop_addr(hop3_pte);
-
-		if (hop4_addr == ULLONG_MAX)
-			goto not_mapped;
-
-		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
-						hop4_addr, virt_addr);
-		hop4_pte = hdev->asic_funcs->read_pte(hdev,
-							hop4_pte_addr);
-		if (!(hop4_pte & LAST_MASK)) {
-			hop5_addr = get_next_hop_addr(hop4_pte);
-
-			if (hop5_addr == ULLONG_MAX)
-				goto not_mapped;
-
-			hop5_pte_addr = get_hop5_pte_addr(ctx, mmu_prop,
-							hop5_addr, virt_addr);
-			hop5_pte = hdev->asic_funcs->read_pte(hdev,
-								hop5_pte_addr);
-			if (!(hop5_pte & PAGE_PRESENT_MASK))
-				goto not_mapped;
-		} else {
-			if (!(hop4_pte & PAGE_PRESENT_MASK))
-				goto not_mapped;
-		}
+	if (hl_mmu_get_tlb_info(ctx, virt_addr, &hops_info)) {
+		dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
+				virt_addr);
+		return 0;
 	}
 
 	seq_printf(s, "asid: %u, virt_addr: 0x%llx\n",
 			dev_entry->mmu_asid, dev_entry->mmu_addr);
 
-	seq_printf(s, "hop0_addr: 0x%llx\n", hop0_addr);
-	seq_printf(s, "hop0_pte_addr: 0x%llx\n", hop0_pte_addr);
-	seq_printf(s, "hop0_pte: 0x%llx\n", hop0_pte);
-
-	seq_printf(s, "hop1_addr: 0x%llx\n", hop1_addr);
-	seq_printf(s, "hop1_pte_addr: 0x%llx\n", hop1_pte_addr);
-	seq_printf(s, "hop1_pte: 0x%llx\n", hop1_pte);
-
-	seq_printf(s, "hop2_addr: 0x%llx\n", hop2_addr);
-	seq_printf(s, "hop2_pte_addr: 0x%llx\n", hop2_pte_addr);
-	seq_printf(s, "hop2_pte: 0x%llx\n", hop2_pte);
-
-	seq_printf(s, "hop3_addr: 0x%llx\n", hop3_addr);
-	seq_printf(s, "hop3_pte_addr: 0x%llx\n", hop3_pte_addr);
-	seq_printf(s, "hop3_pte: 0x%llx\n", hop3_pte);
-
-	if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
-		if (!(hop3_pte & LAST_MASK)) {
-			seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
-			seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
-			seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
-		}
-	} else {
-		seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
-		seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
-		seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
-
-		if (!(hop4_pte & LAST_MASK)) {
-			seq_printf(s, "hop5_addr: 0x%llx\n", hop5_addr);
-			seq_printf(s, "hop5_pte_addr: 0x%llx\n", hop5_pte_addr);
-			seq_printf(s, "hop5_pte: 0x%llx\n", hop5_pte);
-		}
+	for (i = 0 ; i < hops_info.used_hops ; i++) {
+		seq_printf(s, "hop%d_addr: 0x%llx\n",
+				i, hops_info.hop_info[i].hop_addr);
+		seq_printf(s, "hop%d_pte_addr: 0x%llx\n",
+				i, hops_info.hop_info[i].hop_pte_addr);
+		seq_printf(s, "hop%d_pte: 0x%llx\n",
+				i, hops_info.hop_info[i].hop_pte_val);
 	}
 
-	goto out;
-
-not_mapped:
-	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
-			virt_addr);
-out:
-	mutex_unlock(&ctx->mmu_lock);
-
 	return 0;
 }
 
@@ -597,7 +415,7 @@ static bool hl_is_device_va(struct hl_device *hdev, u64 addr)
 	if (!hdev->mmu_enable)
 		goto out;
 
-	if (hdev->dram_supports_virtual_memory &&
+	if (prop->dram_supports_virtual_memory &&
 		(addr >= prop->dmmu.start_addr && addr < prop->dmmu.end_addr))
 		return true;
 
@@ -616,78 +434,20 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr,
 				u64 *phys_addr)
 {
 	struct hl_ctx *ctx = hdev->compute_ctx;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct hl_mmu_properties *mmu_prop;
-	u64 hop_addr, hop_pte_addr, hop_pte;
-	u64 offset_mask = HOP4_MASK | FLAGS_MASK;
 	int rc = 0;
-	bool is_dram_addr;
 
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
 		return -EINVAL;
 	}
 
-	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
-						prop->dmmu.start_addr,
-						prop->dmmu.end_addr);
-
-	/* shifts and masks are the same in PMMU and HPMMU, use one of them */
-	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
-
-	mutex_lock(&ctx->mmu_lock);
-
-	/* hop 0 */
-	hop_addr = get_hop0_addr(ctx);
-	hop_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
-	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
-
-	/* hop 1 */
-	hop_addr = get_next_hop_addr(hop_pte);
-	if (hop_addr == ULLONG_MAX)
-		goto not_mapped;
-	hop_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
-	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
-
-	/* hop 2 */
-	hop_addr = get_next_hop_addr(hop_pte);
-	if (hop_addr == ULLONG_MAX)
-		goto not_mapped;
-	hop_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
-	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
-
-	/* hop 3 */
-	hop_addr = get_next_hop_addr(hop_pte);
-	if (hop_addr == ULLONG_MAX)
-		goto not_mapped;
-	hop_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
-	hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
-
-	if (!(hop_pte & LAST_MASK)) {
-		/* hop 4 */
-		hop_addr = get_next_hop_addr(hop_pte);
-		if (hop_addr == ULLONG_MAX)
-			goto not_mapped;
-		hop_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop_addr,
-							virt_addr);
-		hop_pte = hdev->asic_funcs->read_pte(hdev, hop_pte_addr);
-
-		offset_mask = FLAGS_MASK;
+	rc = hl_mmu_va_to_pa(ctx, virt_addr, phys_addr);
+	if (rc) {
+		dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
+				virt_addr);
+		rc = -EINVAL;
 	}
 
-	if (!(hop_pte & PAGE_PRESENT_MASK))
-		goto not_mapped;
-
-	*phys_addr = (hop_pte & ~offset_mask) | (virt_addr & offset_mask);
-
-	goto out;
-
-not_mapped:
-	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
-			virt_addr);
-	rc = -EINVAL;
-out:
-	mutex_unlock(&ctx->mmu_lock);
 	return rc;
 }
 
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 20572224099a..5871162a8442 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -10,20 +10,9 @@
 #include "habanalabs.h"
 
 #include <linux/pci.h>
-#include <linux/sched/signal.h>
 #include <linux/hwmon.h>
 #include <uapi/misc/habanalabs.h>
 
-#define HL_PLDM_PENDING_RESET_PER_SEC	(HL_PENDING_RESET_PER_SEC * 10)
-
-bool hl_device_disabled_or_in_reset(struct hl_device *hdev)
-{
-	if ((hdev->disabled) || (atomic_read(&hdev->in_reset)))
-		return true;
-	else
-		return false;
-}
-
 enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
@@ -32,12 +21,34 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
 		status = HL_DEVICE_STATUS_MALFUNCTION;
 	else if (atomic_read(&hdev->in_reset))
 		status = HL_DEVICE_STATUS_IN_RESET;
+	else if (hdev->needs_reset)
+		status = HL_DEVICE_STATUS_NEEDS_RESET;
 	else
 		status = HL_DEVICE_STATUS_OPERATIONAL;
 
 	return status;
 }
 
+bool hl_device_operational(struct hl_device *hdev,
+		enum hl_device_status *status)
+{
+	enum hl_device_status current_status;
+
+	current_status = hl_device_status(hdev);
+	if (status)
+		*status = current_status;
+
+	switch (current_status) {
+	case HL_DEVICE_STATUS_IN_RESET:
+	case HL_DEVICE_STATUS_MALFUNCTION:
+	case HL_DEVICE_STATUS_NEEDS_RESET:
+		return false;
+	case HL_DEVICE_STATUS_OPERATIONAL:
+	default:
+		return true;
+	}
+}
+
 static void hpriv_release(struct kref *ref)
 {
 	struct hl_fpriv *hpriv;
@@ -231,16 +242,36 @@ delete_cdev_device:
 
 static void device_cdev_sysfs_del(struct hl_device *hdev)
 {
-	/* device_release() won't be called so must free devices explicitly */
-	if (!hdev->cdev_sysfs_created) {
-		kfree(hdev->dev_ctrl);
-		kfree(hdev->dev);
-		return;
-	}
+	if (!hdev->cdev_sysfs_created)
+		goto put_devices;
 
 	hl_sysfs_fini(hdev);
 	cdev_device_del(&hdev->cdev_ctrl, hdev->dev_ctrl);
 	cdev_device_del(&hdev->cdev, hdev->dev);
+
+put_devices:
+	put_device(hdev->dev);
+	put_device(hdev->dev_ctrl);
+}
+
+static void device_hard_reset_pending(struct work_struct *work)
+{
+	struct hl_device_reset_work *device_reset_work =
+		container_of(work, struct hl_device_reset_work,
+				reset_work.work);
+	struct hl_device *hdev = device_reset_work->hdev;
+	int rc;
+
+	rc = hl_device_reset(hdev, true, true);
+	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
+		dev_info(hdev->dev,
+			"Could not reset device. will try again in %u seconds",
+			HL_PENDING_RESET_PER_SEC);
+
+		queue_delayed_work(device_reset_work->wq,
+			&device_reset_work->reset_work,
+			msecs_to_jiffies(HL_PENDING_RESET_PER_SEC * 1000));
+	}
 }
 
 /*
@@ -327,17 +358,32 @@ static int device_early_init(struct hl_device *hdev)
 
 	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 
+	hdev->device_reset_work.wq =
+			create_singlethread_workqueue("hl_device_reset");
+	if (!hdev->device_reset_work.wq) {
+		rc = -ENOMEM;
+		dev_err(hdev->dev, "Failed to create device reset WQ\n");
+		goto free_cb_mgr;
+	}
+
+	INIT_DELAYED_WORK(&hdev->device_reset_work.reset_work,
+			device_hard_reset_pending);
+	hdev->device_reset_work.hdev = hdev;
+	hdev->device_fini_pending = 0;
+
 	mutex_init(&hdev->send_cpu_message_lock);
 	mutex_init(&hdev->debug_lock);
 	mutex_init(&hdev->mmu_cache_lock);
-	INIT_LIST_HEAD(&hdev->hw_queues_mirror_list);
-	spin_lock_init(&hdev->hw_queues_mirror_lock);
+	INIT_LIST_HEAD(&hdev->cs_mirror_list);
+	spin_lock_init(&hdev->cs_mirror_lock);
 	INIT_LIST_HEAD(&hdev->fpriv_list);
 	mutex_init(&hdev->fpriv_list_lock);
 	atomic_set(&hdev->in_reset, 0);
 
 	return 0;
 
+free_cb_mgr:
+	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 free_idle_busy_ts_arr:
 	kfree(hdev->idle_busy_ts_arr);
 free_chip_info:
@@ -380,6 +426,7 @@ static void device_early_fini(struct hl_device *hdev)
 	kfree(hdev->hl_chip_info);
 
 	destroy_workqueue(hdev->eq_wq);
+	destroy_workqueue(hdev->device_reset_work.wq);
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		destroy_workqueue(hdev->cq_wq[i]);
@@ -412,7 +459,7 @@ static void hl_device_heartbeat(struct work_struct *work)
 	struct hl_device *hdev = container_of(work, struct hl_device,
 						work_heartbeat.work);
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		goto reschedule;
 
 	if (!hdev->asic_funcs->send_heartbeat(hdev))
@@ -758,16 +805,12 @@ disable_device:
 	return rc;
 }
 
-static int device_kill_open_processes(struct hl_device *hdev)
+static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 {
-	u16 pending_total, pending_cnt;
 	struct hl_fpriv	*hpriv;
 	struct task_struct *task = NULL;
+	u32 pending_cnt;
 
-	if (hdev->pldm)
-		pending_total = HL_PLDM_PENDING_RESET_PER_SEC;
-	else
-		pending_total = HL_PENDING_RESET_PER_SEC;
 
 	/* Giving time for user to close FD, and for processes that are inside
 	 * hl_device_open to finish
@@ -775,6 +818,19 @@ static int device_kill_open_processes(struct hl_device *hdev)
 	if (!list_empty(&hdev->fpriv_list))
 		ssleep(1);
 
+	if (timeout) {
+		pending_cnt = timeout;
+	} else {
+		if (hdev->process_kill_trial_cnt) {
+			/* Processes have been already killed */
+			pending_cnt = 1;
+			goto wait_for_processes;
+		} else {
+			/* Wait a small period after process kill */
+			pending_cnt = HL_PENDING_RESET_PER_SEC;
+		}
+	}
+
 	mutex_lock(&hdev->fpriv_list_lock);
 
 	/* This section must be protected because we are dereferencing
@@ -794,16 +850,18 @@ static int device_kill_open_processes(struct hl_device *hdev)
 
 	mutex_unlock(&hdev->fpriv_list_lock);
 
-	/* We killed the open users, but because the driver cleans up after the
-	 * user contexts are closed (e.g. mmu mappings), we need to wait again
-	 * to make sure the cleaning phase is finished before continuing with
-	 * the reset
+	/*
+	 * We killed the open users, but that doesn't mean they are closed.
+	 * It could be that they are running a long cleanup phase in the driver
+	 * e.g. MMU unmappings, or running other long teardown flow even before
+	 * our cleanup.
+	 * Therefore we need to wait again to make sure they are closed before
+	 * continuing with the reset.
 	 */
 
-	pending_cnt = pending_total;
-
+wait_for_processes:
 	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
-		dev_info(hdev->dev,
+		dev_dbg(hdev->dev,
 			"Waiting for all unmap operations to finish before hard reset\n");
 
 		pending_cnt--;
@@ -811,18 +869,17 @@ static int device_kill_open_processes(struct hl_device *hdev)
 		ssleep(1);
 	}
 
-	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
-}
+	/* All processes exited successfully */
+	if (list_empty(&hdev->fpriv_list))
+		return 0;
 
-static void device_hard_reset_pending(struct work_struct *work)
-{
-	struct hl_device_reset_work *device_reset_work =
-		container_of(work, struct hl_device_reset_work, reset_work);
-	struct hl_device *hdev = device_reset_work->hdev;
+	/* Give up waiting for processes to exit */
+	if (hdev->process_kill_trial_cnt == HL_PENDING_RESET_MAX_TRIALS)
+		return -ETIME;
 
-	hl_device_reset(hdev, true, true);
+	hdev->process_kill_trial_cnt++;
 
-	kfree(device_reset_work);
+	return -EBUSY;
 }
 
 /*
@@ -859,6 +916,10 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		hard_reset = true;
 	}
 
+	/* Re-entry of reset thread */
+	if (from_hard_reset_thread && hdev->process_kill_trial_cnt)
+		goto kill_processes;
+
 	/*
 	 * Prevent concurrency in this function - only one reset should be
 	 * done at any given time. Only need to perform this if we didn't
@@ -904,26 +965,17 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 
 again:
 	if ((hard_reset) && (!from_hard_reset_thread)) {
-		struct hl_device_reset_work *device_reset_work;
-
 		hdev->hard_reset_pending = true;
 
-		device_reset_work = kzalloc(sizeof(*device_reset_work),
-						GFP_ATOMIC);
-		if (!device_reset_work) {
-			rc = -ENOMEM;
-			goto out_err;
-		}
+		hdev->process_kill_trial_cnt = 0;
 
 		/*
 		 * Because the reset function can't run from interrupt or
 		 * from heartbeat work, we need to call the reset function
 		 * from a dedicated work
 		 */
-		INIT_WORK(&device_reset_work->reset_work,
-				device_hard_reset_pending);
-		device_reset_work->hdev = hdev;
-		schedule_work(&device_reset_work->reset_work);
+		queue_delayed_work(hdev->device_reset_work.wq,
+			&hdev->device_reset_work.reset_work, 0);
 
 		return 0;
 	}
@@ -949,12 +1001,25 @@ again:
 	/* Go over all the queues, release all CS and their jobs */
 	hl_cs_rollback_all(hdev);
 
+kill_processes:
 	if (hard_reset) {
 		/* Kill processes here after CS rollback. This is because the
 		 * process can't really exit until all its CSs are done, which
 		 * is what we do in cs rollback
 		 */
-		rc = device_kill_open_processes(hdev);
+		rc = device_kill_open_processes(hdev, 0);
+
+		if (rc == -EBUSY) {
+			if (hdev->device_fini_pending) {
+				dev_crit(hdev->dev,
+					"Failed to kill all open processes, stopping hard reset\n");
+				goto out_err;
+			}
+
+			/* signal reset thread to reschedule */
+			return rc;
+		}
+
 		if (rc) {
 			dev_crit(hdev->dev,
 				"Failed to kill all open processes, stopping hard reset\n");
@@ -1089,6 +1154,7 @@ again:
 	}
 
 	atomic_set(&hdev->in_reset, 0);
+	hdev->needs_reset = false;
 
 	if (hard_reset)
 		hdev->hard_reset_cnt++;
@@ -1261,13 +1327,6 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 
 	hl_debugfs_add_device(hdev);
 
-	if (hdev->asic_funcs->get_hw_state(hdev) == HL_DEVICE_HW_STATE_DIRTY) {
-		dev_info(hdev->dev,
-			"H/W state is dirty, must reset before initializing\n");
-		hdev->asic_funcs->halt_engines(hdev, true);
-		hdev->asic_funcs->hw_fini(hdev, true);
-	}
-
 	/*
 	 * From this point, in case of an error, add char devices and create
 	 * sysfs nodes as part of the error flow, to allow debugging.
@@ -1371,9 +1430,9 @@ sw_fini:
 early_fini:
 	device_early_fini(hdev);
 free_dev_ctrl:
-	kfree(hdev->dev_ctrl);
+	put_device(hdev->dev_ctrl);
 free_dev:
-	kfree(hdev->dev);
+	put_device(hdev->dev);
 out_disabled:
 	hdev->disabled = true;
 	if (add_cdev_sysfs_on_err)
@@ -1398,11 +1457,14 @@ out_disabled:
  */
 void hl_device_fini(struct hl_device *hdev)
 {
-	int i, rc;
 	ktime_t timeout;
+	int i, rc;
 
 	dev_info(hdev->dev, "Removing device\n");
 
+	hdev->device_fini_pending = 1;
+	flush_delayed_work(&hdev->device_reset_work.reset_work);
+
 	/*
 	 * This function is competing with the reset function, so try to
 	 * take the reset atomic and if we are already in middle of reset,
@@ -1458,7 +1520,11 @@ void hl_device_fini(struct hl_device *hdev)
 	 * can't really exit until all its CSs are done, which is what we
 	 * do in cs rollback
 	 */
-	rc = device_kill_open_processes(hdev);
+	dev_info(hdev->dev,
+		"Waiting for all processes to exit (timeout of %u seconds)",
+		HL_PENDING_RESET_LONG_SEC);
+
+	rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC);
 	if (rc)
 		dev_crit(hdev->dev, "Failed to kill all open processes\n");
 
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index cd41c7ceb0e7..0e1c629e9800 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -9,8 +9,6 @@
 #include "../include/common/hl_boot_if.h"
 
 #include <linux/firmware.h>
-#include <linux/genalloc.h>
-#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/slab.h>
 
 #define FW_FILE_MAX_SIZE	0x1400000 /* maximum size of 20MB */
@@ -20,16 +18,18 @@
  * @hdev: pointer to hl_device structure.
  * @fw_name: the firmware image name
  * @dst: IO memory mapped address space to copy firmware to
+ * @src_offset: offset in src FW to copy from
+ * @size: amount of bytes to copy (0 to copy the whole binary)
  *
  * Copy fw code from firmware file to device memory.
  *
  * Return: 0 on success, non-zero for failure.
  */
 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
-				void __iomem *dst)
+				void __iomem *dst, u32 src_offset, u32 size)
 {
 	const struct firmware *fw;
-	const u64 *fw_data;
+	const void *fw_data;
 	size_t fw_size;
 	int rc;
 
@@ -57,9 +57,20 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
 		goto out;
 	}
 
-	fw_data = (const u64 *) fw->data;
+	if (size - src_offset > fw_size) {
+		dev_err(hdev->dev,
+			"size to copy(%u) and offset(%u) are invalid\n",
+			size, src_offset);
+		rc = -EINVAL;
+		goto out;
+	}
+
+	if (size)
+		fw_size = size;
+
+	fw_data = (const void *) fw->data;
 
-	memcpy_toio(dst, fw_data, fw_size);
+	memcpy_toio(dst, fw_data + src_offset, fw_size);
 
 out:
 	release_firmware(fw);
@@ -77,7 +88,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
 }
 
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
-				u16 len, u32 timeout, long *result)
+				u16 len, u32 timeout, u64 *result)
 {
 	struct cpucp_packet *pkt;
 	dma_addr_t pkt_dma_addr;
@@ -132,7 +143,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 						>> CPUCP_PKT_CTL_OPCODE_SHIFT);
 		rc = -EIO;
 	} else if (result) {
-		*result = (long) le64_to_cpu(pkt->result);
+		*result = le64_to_cpu(pkt->result);
 	}
 
 out:
@@ -146,7 +157,7 @@ out:
 int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
 {
 	struct cpucp_packet pkt;
-	long result;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -169,7 +180,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 {
 	struct cpucp_unmask_irq_arr_packet *pkt;
 	size_t total_pkt_size;
-	long result;
+	u64 result;
 	int rc;
 
 	total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
@@ -208,7 +219,7 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 int hl_fw_test_cpu_queue(struct hl_device *hdev)
 {
 	struct cpucp_packet test_pkt = {};
-	long result;
+	u64 result;
 	int rc;
 
 	test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
@@ -221,7 +232,7 @@ int hl_fw_test_cpu_queue(struct hl_device *hdev)
 	if (!rc) {
 		if (result != CPUCP_PACKET_FENCE_VAL)
 			dev_err(hdev->dev,
-				"CPU queue test failed (0x%08lX)\n", result);
+				"CPU queue test failed (%#08llx)\n", result);
 	} else {
 		dev_err(hdev->dev, "CPU queue test failed, error %d\n", rc);
 	}
@@ -252,7 +263,7 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 int hl_fw_send_heartbeat(struct hl_device *hdev)
 {
 	struct cpucp_packet hb_pkt = {};
-	long result;
+	u64 result;
 	int rc;
 
 	hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
@@ -268,13 +279,14 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
 	return rc;
 }
 
-int hl_fw_cpucp_info_get(struct hl_device *hdev)
+int hl_fw_cpucp_info_get(struct hl_device *hdev,
+			u32 cpu_security_boot_status_reg)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct cpucp_packet pkt = {};
 	void *cpucp_info_cpu_addr;
 	dma_addr_t cpucp_info_dma_addr;
-	long result;
+	u64 result;
 	int rc;
 
 	cpucp_info_cpu_addr =
@@ -313,6 +325,11 @@ int hl_fw_cpucp_info_get(struct hl_device *hdev)
 		goto out;
 	}
 
+	/* Read FW application security bits again */
+	if (hdev->asic_prop.fw_security_status_valid)
+		hdev->asic_prop.fw_app_security_map =
+				RREG32(cpu_security_boot_status_reg);
+
 out:
 	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
 			sizeof(struct cpucp_info), cpucp_info_cpu_addr);
@@ -325,7 +342,7 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 	struct cpucp_packet pkt = {};
 	void *eeprom_info_cpu_addr;
 	dma_addr_t eeprom_info_dma_addr;
-	long result;
+	u64 result;
 	int rc;
 
 	eeprom_info_cpu_addr =
@@ -368,7 +385,7 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
 		struct hl_info_pci_counters *counters)
 {
 	struct cpucp_packet pkt = {};
-	long result;
+	u64 result;
 	int rc;
 
 	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET <<
@@ -415,7 +432,7 @@ int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
 int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
 {
 	struct cpucp_packet pkt = {};
-	long result;
+	u64 result;
 	int rc;
 
 	pkt.ctl = cpu_to_le32(CPUCP_PACKET_TOTAL_ENERGY_GET <<
@@ -435,9 +452,36 @@ int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
 	return rc;
 }
 
-static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
+int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u16 pll_index,
+		u16 *pll_freq_arr)
 {
-	u32 err_val;
+	struct cpucp_packet pkt;
+	u64 result;
+	int rc;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PLL_INFO_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.pll_type = __cpu_to_le16(pll_index);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc)
+		dev_err(hdev->dev, "Failed to read PLL info, error %d\n", rc);
+
+	pll_freq_arr[0] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT0_MASK, result);
+	pll_freq_arr[1] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT1_MASK, result);
+	pll_freq_arr[2] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT2_MASK, result);
+	pll_freq_arr[3] = FIELD_GET(CPUCP_PKT_RES_PLL_OUT3_MASK, result);
+
+	return rc;
+}
+
+static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg,
+		u32 cpu_security_boot_status_reg)
+{
+	u32 err_val, security_val;
 
 	/* Some of the firmware status codes are deprecated in newer f/w
 	 * versions. In those versions, the errors are reported
@@ -472,6 +516,18 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
 	if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
 		dev_err(hdev->dev,
 			"Device boot error - NIC F/W initialization failed\n");
+	if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
+		dev_warn(hdev->dev,
+			"Device boot warning - security not ready\n");
+	if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
+		dev_err(hdev->dev, "Device boot error - security failure\n");
+	if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
+		dev_err(hdev->dev, "Device boot error - eFuse failure\n");
+
+	security_val = RREG32(cpu_security_boot_status_reg);
+	if (security_val & CPU_BOOT_DEV_STS0_ENABLED)
+		dev_dbg(hdev->dev, "Device security status %#x\n",
+				security_val);
 }
 
 static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
@@ -524,10 +580,12 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 	}
 }
 
-int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
-				u32 boot_err0_reg, u32 timeout)
+int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
+		u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
+		u32 timeout)
 {
-	u32 status;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u32 status, security_status;
 	int rc;
 
 	if (!hdev->cpu_enable)
@@ -557,23 +615,52 @@ int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
 	if (rc) {
 		dev_err(hdev->dev, "Failed to read preboot version\n");
 		detect_cpu_boot_status(hdev, status);
-		fw_read_errors(hdev, boot_err0_reg);
+		fw_read_errors(hdev, boot_err0_reg,
+				cpu_security_boot_status_reg);
 		return -EIO;
 	}
 
-	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+	rc = hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+	if (rc)
+		return rc;
+
+	security_status = RREG32(cpu_security_boot_status_reg);
+
+	/* We read security status multiple times during boot:
+	 * 1. preboot - we check if fw security feature is supported
+	 * 2. boot cpu - we get boot cpu security status
+	 * 3. FW application - we get FW application security status
+	 *
+	 * Preboot:
+	 * Check security status bit (CPU_BOOT_DEV_STS0_ENABLED), if it is set
+	 * check security enabled bit (CPU_BOOT_DEV_STS0_SECURITY_EN)
+	 */
+	if (security_status & CPU_BOOT_DEV_STS0_ENABLED) {
+		hdev->asic_prop.fw_security_status_valid = 1;
+		prop->fw_security_disabled =
+			!(security_status & CPU_BOOT_DEV_STS0_SECURITY_EN);
+	} else {
+		hdev->asic_prop.fw_security_status_valid = 0;
+		prop->fw_security_disabled = true;
+	}
+
+	dev_info(hdev->dev, "firmware-level security is %s\n",
+		prop->fw_security_disabled ? "disabled" : "enabled");
 
 	return 0;
 }
 
 int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 			u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
-			u32 boot_err0_reg, bool skip_bmc,
-			u32 cpu_timeout, u32 boot_fit_timeout)
+			u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
+			bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout)
 {
 	u32 status;
 	int rc;
 
+	if (!(hdev->fw_loading & FW_TYPE_BOOT_CPU))
+		return 0;
+
 	dev_info(hdev->dev, "Going to wait for device boot (up to %lds)\n",
 		cpu_timeout / USEC_PER_SEC);
 
@@ -631,17 +718,24 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 		10000,
 		cpu_timeout);
 
+	dev_dbg(hdev->dev, "uboot status = %d\n", status);
+
 	/* Read U-Boot version now in case we will later fail */
 	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT);
 
+	/* Read boot_cpu security bits */
+	if (hdev->asic_prop.fw_security_status_valid)
+		hdev->asic_prop.fw_boot_cpu_security_map =
+				RREG32(cpu_security_boot_status_reg);
+
 	if (rc) {
 		detect_cpu_boot_status(hdev, status);
 		rc = -EIO;
 		goto out;
 	}
 
-	if (!hdev->fw_loading) {
-		dev_info(hdev->dev, "Skip loading FW\n");
+	if (!(hdev->fw_loading & FW_TYPE_LINUX)) {
+		dev_info(hdev->dev, "Skip loading Linux F/W\n");
 		goto out;
 	}
 
@@ -702,10 +796,23 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 		goto out;
 	}
 
+	/* Read FW application security bits */
+	if (hdev->asic_prop.fw_security_status_valid) {
+		hdev->asic_prop.fw_app_security_map =
+				RREG32(cpu_security_boot_status_reg);
+
+		if (hdev->asic_prop.fw_app_security_map &
+				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
+			hdev->asic_prop.hard_reset_done_by_fw = true;
+	}
+
+	dev_dbg(hdev->dev, "Firmware hard-reset is %s\n",
+		hdev->asic_prop.hard_reset_done_by_fw ? "enabled" : "disabled");
+
 	dev_info(hdev->dev, "Successfully loaded firmware to device\n");
 
 out:
-	fw_read_errors(hdev, boot_err0_reg);
+	fw_read_errors(hdev, boot_err0_reg, cpu_security_boot_status_reg);
 
 	return rc;
 }
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 6ed974d2def0..571eda6ef5ab 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -10,6 +10,7 @@
 
 #include "../include/common/cpucp_if.h"
 #include "../include/common/qman_if.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
 #include <uapi/misc/habanalabs.h>
 
 #include <linux/cdev.h>
@@ -19,6 +20,10 @@
 #include <linux/scatterlist.h>
 #include <linux/hashtable.h>
 #include <linux/bitfield.h>
+#include <linux/genalloc.h>
+#include <linux/sched/signal.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
+#include <linux/coresight.h>
 
 #define HL_NAME				"habanalabs"
 
@@ -36,7 +41,9 @@
 #define HL_MMAP_OFFSET_VALUE_MASK	(0x3FFFFFFFFFFFull >> PAGE_SHIFT)
 #define HL_MMAP_OFFSET_VALUE_GET(off)	(off & HL_MMAP_OFFSET_VALUE_MASK)
 
-#define HL_PENDING_RESET_PER_SEC	30
+#define HL_PENDING_RESET_PER_SEC	10
+#define HL_PENDING_RESET_MAX_TRIALS	60 /* 10 minutes */
+#define HL_PENDING_RESET_LONG_SEC	60
 
 #define HL_HARD_RESET_MAX_TIMEOUT	120
 
@@ -61,15 +68,29 @@
 /* MMU */
 #define MMU_HASH_TABLE_BITS		7 /* 1 << 7 buckets */
 
+/**
+ * enum hl_mmu_page_table_locaion - mmu page table location
+ * @MMU_DR_PGT: page-table is located on device DRAM.
+ * @MMU_HR_PGT: page-table is located on host memory.
+ * @MMU_NUM_PGT_LOCATIONS: number of page-table locations currently supported.
+ */
+enum hl_mmu_page_table_location {
+	MMU_DR_PGT = 0,		/* device-dram-resident MMU PGT */
+	MMU_HR_PGT,		/* host resident MMU PGT */
+	MMU_NUM_PGT_LOCATIONS	/* num of PGT locations */
+};
+
 /*
  * HL_RSVD_SOBS 'sync stream' reserved sync objects per QMAN stream
  * HL_RSVD_MONS 'sync stream' reserved monitors per QMAN stream
  */
-#define HL_RSVD_SOBS			4
-#define HL_RSVD_MONS			2
+#define HL_RSVD_SOBS			2
+#define HL_RSVD_MONS			1
 
-#define HL_RSVD_SOBS_IN_USE		2
-#define HL_RSVD_MONS_IN_USE		1
+/*
+ * HL_COLLECTIVE_RSVD_MSTR_MONS 'collective' reserved monitors per QMAN stream
+ */
+#define HL_COLLECTIVE_RSVD_MSTR_MONS	2
 
 #define HL_MAX_SOB_VAL			(1 << 15)
 
@@ -80,6 +101,28 @@
 
 #define HL_MAX_DCORES			4
 
+#define HL_MAX_SOBS_PER_MONITOR	8
+
+/**
+ * struct hl_gen_wait_properties - properties for generating a wait CB
+ * @data: command buffer
+ * @q_idx: queue id is used to extract fence register address
+ * @size: offset in command buffer
+ * @sob_base: SOB base to use in this wait CB
+ * @sob_val: SOB value to wait for
+ * @mon_id: monitor to use in this wait CB
+ * @sob_mask: each bit represents a SOB offset from sob_base to be used
+ */
+struct hl_gen_wait_properties {
+	void	*data;
+	u32	q_idx;
+	u32	size;
+	u16	sob_base;
+	u16	sob_val;
+	u16	mon_id;
+	u8	sob_mask;
+};
+
 /**
  * struct pgt_info - MMU hop page info.
  * @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -125,6 +168,18 @@ enum hl_fw_component {
 };
 
 /**
+ * enum hl_fw_types - F/W types to load
+ * @FW_TYPE_LINUX: Linux image for device CPU
+ * @FW_TYPE_BOOT_CPU: Boot image for device CPU
+ * @FW_TYPE_ALL_TYPES: Mask for all types
+ */
+enum hl_fw_types {
+	FW_TYPE_LINUX = 0x1,
+	FW_TYPE_BOOT_CPU = 0x2,
+	FW_TYPE_ALL_TYPES = (FW_TYPE_LINUX | FW_TYPE_BOOT_CPU)
+};
+
+/**
  * enum hl_queue_type - Supported QUEUE types.
  * @QUEUE_TYPE_NA: queue is not available.
  * @QUEUE_TYPE_EXT: external queue which is a DMA channel that may access the
@@ -146,7 +201,8 @@ enum hl_queue_type {
 enum hl_cs_type {
 	CS_TYPE_DEFAULT,
 	CS_TYPE_SIGNAL,
-	CS_TYPE_WAIT
+	CS_TYPE_WAIT,
+	CS_TYPE_COLLECTIVE_WAIT
 };
 
 /*
@@ -176,6 +232,17 @@ struct hl_outbound_pci_region {
 };
 
 /*
+ * enum queue_cb_alloc_flags - Indicates queue support for CBs that
+ * allocated by Kernel or by User
+ * @CB_ALLOC_KERNEL: support only CBs that allocated by Kernel
+ * @CB_ALLOC_USER: support only CBs that allocated by User
+ */
+enum queue_cb_alloc_flags {
+	CB_ALLOC_KERNEL = 0x1,
+	CB_ALLOC_USER   = 0x2
+};
+
+/*
  * struct hl_hw_sob - H/W SOB info.
  * @hdev: habanalabs device structure.
  * @kref: refcount of this SOB. The SOB will reset once the refcount is zero.
@@ -189,19 +256,29 @@ struct hl_hw_sob {
 	u32			q_idx;
 };
 
+enum hl_collective_mode {
+	HL_COLLECTIVE_NOT_SUPPORTED = 0x0,
+	HL_COLLECTIVE_MASTER = 0x1,
+	HL_COLLECTIVE_SLAVE = 0x2
+};
+
 /**
  * struct hw_queue_properties - queue information.
  * @type: queue type.
+ * @queue_cb_alloc_flags: bitmap which indicates if the hw queue supports CB
+ *                        that allocated by the Kernel driver and therefore,
+ *                        a CB handle can be provided for jobs on this queue.
+ *                        Otherwise, a CB address must be provided.
+ * @collective_mode: collective mode of current queue
  * @driver_only: true if only the driver is allowed to send a job to this queue,
  *               false otherwise.
- * @requires_kernel_cb: true if a CB handle must be provided for jobs on this
- *                      queue, false otherwise (a CB address must be provided).
  * @supports_sync_stream: True if queue supports sync stream
  */
 struct hw_queue_properties {
 	enum hl_queue_type	type;
+	enum queue_cb_alloc_flags cb_alloc_flags;
+	enum hl_collective_mode	collective_mode;
 	u8			driver_only;
-	u8			requires_kernel_cb;
 	u8			supports_sync_stream;
 };
 
@@ -227,6 +304,8 @@ enum hl_device_hw_state {
 	HL_DEVICE_HW_STATE_DIRTY
 };
 
+#define HL_MMU_VA_ALIGNMENT_NOT_NEEDED 0
+
 /**
  * struct hl_mmu_properties - ASIC specific MMU address translation properties.
  * @start_addr: virtual start address of the memory region.
@@ -245,6 +324,8 @@ enum hl_device_hw_state {
  * @hop5_mask: mask to get the PTE address in hop 5.
  * @page_size: default page size used to allocate memory.
  * @num_hops: The amount of hops supported by the translation table.
+ * @host_resident: Should the MMU page table reside in host memory or in the
+ *                 device DRAM.
  */
 struct hl_mmu_properties {
 	u64	start_addr;
@@ -263,6 +344,7 @@ struct hl_mmu_properties {
 	u64	hop5_mask;
 	u32	page_size;
 	u32	num_hops;
+	u8	host_resident;
 };
 
 /**
@@ -314,6 +396,14 @@ struct hl_mmu_properties {
  * @cb_pool_cb_size: size of each CB in the CB pool.
  * @max_pending_cs: maximum of concurrent pending command submissions
  * @max_queues: maximum amount of queues in the system
+ * @fw_boot_cpu_security_map: bitmap representation of boot cpu security status
+ *                            reported by FW, bit description can be found in
+ *                            CPU_BOOT_DEV_STS*
+ * @fw_app_security_map: bitmap representation of application security status
+ *                       reported by FW, bit description can be found in
+ *                       CPU_BOOT_DEV_STS*
+ * @collective_first_sob: first sync object available for collective use
+ * @collective_first_mon: first monitor available for collective use
  * @sync_stream_first_sob: first sync object available for sync stream use
  * @sync_stream_first_mon: first monitor available for sync stream use
  * @first_available_user_sob: first sob available for the user
@@ -322,6 +412,10 @@ struct hl_mmu_properties {
  * @completion_queues_count: number of completion queues.
  * @fw_security_disabled: true if security measures are disabled in firmware,
  *                        false otherwise
+ * @fw_security_status_valid: security status bits are valid and can be fetched
+ *                            from BOOT_DEV_STS0
+ * @dram_supports_virtual_memory: is there an MMU towards the DRAM
+ * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
  */
 struct asic_fixed_properties {
 	struct hw_queue_properties	*hw_queues_props;
@@ -366,6 +460,10 @@ struct asic_fixed_properties {
 	u32				cb_pool_cb_size;
 	u32				max_pending_cs;
 	u32				max_queues;
+	u32				fw_boot_cpu_security_map;
+	u32				fw_app_security_map;
+	u16				collective_first_sob;
+	u16				collective_first_mon;
 	u16				sync_stream_first_sob;
 	u16				sync_stream_first_mon;
 	u16				first_available_user_sob[HL_MAX_DCORES];
@@ -373,6 +471,9 @@ struct asic_fixed_properties {
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
 	u8				fw_security_disabled;
+	u8				fw_security_status_valid;
+	u8				dram_supports_virtual_memory;
+	u8				hard_reset_done_by_fw;
 };
 
 /**
@@ -380,12 +481,14 @@ struct asic_fixed_properties {
  * @completion: fence is implemented using completion
  * @refcount: refcount for this fence
  * @error: mark this fence with error
+ * @timestamp: timestamp upon completion
  *
  */
 struct hl_fence {
 	struct completion	completion;
 	struct kref		refcount;
 	int			error;
+	ktime_t			timestamp;
 };
 
 /**
@@ -397,6 +500,7 @@ struct hl_fence {
  * @cs_seq: command submission sequence number.
  * @type: type of the CS - signal/wait.
  * @sob_val: the SOB value that is used in this signal/wait CS.
+ * @sob_group: the SOB group that is used in this collective wait CS.
  */
 struct hl_cs_compl {
 	struct hl_fence		base_fence;
@@ -406,6 +510,7 @@ struct hl_cs_compl {
 	u64			cs_seq;
 	enum hl_cs_type		type;
 	u16			sob_val;
+	u16			sob_group;
 };
 
 /*
@@ -427,7 +532,7 @@ struct hl_cb_mgr {
  * @refcount: reference counter for usage of the CB.
  * @hdev: pointer to device this CB belongs to.
  * @ctx: pointer to the CB owner's context.
- * @lock: spinlock to protect mmap/cs flows.
+ * @lock: spinlock to protect mmap flows.
  * @debugfs_list: node in debugfs list of command buffers.
  * @pool_list: node in pool list of command buffers.
  * @va_block_list: list of virtual addresses blocks of the CB if it is mapped to
@@ -456,7 +561,7 @@ struct hl_cb {
 	dma_addr_t		bus_address;
 	u32			mmap_size;
 	u32			size;
-	u32			cs_cnt;
+	atomic_t		cs_cnt;
 	u8			mmap;
 	u8			is_pool;
 	u8			is_internal;
@@ -468,6 +573,7 @@ struct hl_cb {
  * QUEUES
  */
 
+struct hl_cs;
 struct hl_cs_job;
 
 /* Queue length of external and HW queues */
@@ -490,10 +596,38 @@ struct hl_cs_job;
 #define HL_CPU_ACCESSIBLE_MEM_SIZE	SZ_2M
 
 /**
- * struct hl_hw_queue - describes a H/W transport queue.
+ * struct hl_sync_stream_properties -
+ *     describes a H/W queue sync stream properties
  * @hw_sob: array of the used H/W SOBs by this H/W queue.
+ * @next_sob_val: the next value to use for the currently used SOB.
+ * @base_sob_id: the base SOB id of the SOBs used by this queue.
+ * @base_mon_id: the base MON id of the MONs used by this queue.
+ * @collective_mstr_mon_id: the MON ids of the MONs used by this master queue
+ *                          in order to sync with all slave queues.
+ * @collective_slave_mon_id: the MON id used by this slave queue in order to
+ *                           sync with its master queue.
+ * @collective_sob_id: current SOB id used by this collective slave queue
+ *                     to signal its collective master queue upon completion.
+ * @curr_sob_offset: the id offset to the currently used SOB from the
+ *                   HL_RSVD_SOBS that are being used by this queue.
+ */
+struct hl_sync_stream_properties {
+	struct hl_hw_sob hw_sob[HL_RSVD_SOBS];
+	u16		next_sob_val;
+	u16		base_sob_id;
+	u16		base_mon_id;
+	u16		collective_mstr_mon_id[HL_COLLECTIVE_RSVD_MSTR_MONS];
+	u16		collective_slave_mon_id;
+	u16		collective_sob_id;
+	u8		curr_sob_offset;
+};
+
+/**
+ * struct hl_hw_queue - describes a H/W transport queue.
  * @shadow_queue: pointer to a shadow queue that holds pointers to jobs.
+ * @sync_stream_prop: sync stream queue properties
  * @queue_type: type of queue.
+ * @collective_mode: collective mode of current queue
  * @kernel_address: holds the queue's kernel virtual address.
  * @bus_address: holds the queue's DMA address.
  * @pi: holds the queue's pi value.
@@ -502,33 +636,25 @@ struct hl_cs_job;
  * @cq_id: the id for the corresponding CQ for this H/W queue.
  * @msi_vec: the IRQ number of the H/W queue.
  * @int_queue_len: length of internal queue (number of entries).
- * @next_sob_val: the next value to use for the currently used SOB.
- * @base_sob_id: the base SOB id of the SOBs used by this queue.
- * @base_mon_id: the base MON id of the MONs used by this queue.
  * @valid: is the queue valid (we have array of 32 queues, not all of them
  *         exist).
- * @curr_sob_offset: the id offset to the currently used SOB from the
- *                   HL_RSVD_SOBS that are being used by this queue.
  * @supports_sync_stream: True if queue supports sync stream
  */
 struct hl_hw_queue {
-	struct hl_hw_sob	hw_sob[HL_RSVD_SOBS];
-	struct hl_cs_job	**shadow_queue;
-	enum hl_queue_type	queue_type;
-	void			*kernel_address;
-	dma_addr_t		bus_address;
-	u32			pi;
-	atomic_t		ci;
-	u32			hw_queue_id;
-	u32			cq_id;
-	u32			msi_vec;
-	u16			int_queue_len;
-	u16			next_sob_val;
-	u16			base_sob_id;
-	u16			base_mon_id;
-	u8			valid;
-	u8			curr_sob_offset;
-	u8			supports_sync_stream;
+	struct hl_cs_job			**shadow_queue;
+	struct hl_sync_stream_properties	sync_stream_prop;
+	enum hl_queue_type			queue_type;
+	enum hl_collective_mode			collective_mode;
+	void					*kernel_address;
+	dma_addr_t				bus_address;
+	u32					pi;
+	atomic_t				ci;
+	u32					hw_queue_id;
+	u32					cq_id;
+	u32					msi_vec;
+	u16					int_queue_len;
+	u8					valid;
+	u8					supports_sync_stream;
 };
 
 /**
@@ -650,6 +776,7 @@ enum div_select_defs {
  *                           dma_free_coherent(). This is ASIC function because
  *                           its implementation is not trivial when the driver
  *                           is loaded in simulation mode (not upstreamed).
+ * @scrub_device_mem: Scrub device memory given an address and size
  * @get_int_queue_base: get the internal queue base address.
  * @test_queues: run simple test on all queues for sanity check.
  * @asic_dma_pool_zalloc: small DMA allocation of coherent memory from DMA pool.
@@ -700,6 +827,7 @@ enum div_select_defs {
  * @wreg: Write a register. Needed for simulator support.
  * @halt_coresight: stop the ETF and ETR traces.
  * @ctx_init: context dependent initialization.
+ * @ctx_fini: context dependent cleanup.
  * @get_clk_rate: Retrieve the ASIC current and maximum clock rate in MHz
  * @get_queue_id_for_cq: Get the H/W queue id related to the given CQ index.
  * @read_device_fw_version: read the device's firmware versions that are
@@ -711,9 +839,13 @@ enum div_select_defs {
  * @gen_signal_cb: Generate a signal CB.
  * @gen_wait_cb: Generate a wait CB.
  * @reset_sob: Reset a SOB.
+ * @reset_sob_group: Reset SOB group
  * @set_dma_mask_from_fw: set the DMA mask in the driver according to the
  *                        firmware configuration
  * @get_device_time: Get the device time.
+ * @collective_wait_init_cs: Generate collective master/slave packets
+ *                           and place them in the relevant cs jobs
+ * @collective_wait_create_jobs: allocate collective wait cs jobs
  */
 struct hl_asic_funcs {
 	int (*early_init)(struct hl_device *hdev);
@@ -736,6 +868,7 @@ struct hl_asic_funcs {
 					dma_addr_t *dma_handle, gfp_t flag);
 	void (*asic_dma_free_coherent)(struct hl_device *hdev, size_t size,
 					void *cpu_addr, dma_addr_t dma_handle);
+	int (*scrub_device_mem)(struct hl_device *hdev, u64 addr, u64 size);
 	void* (*get_int_queue_base)(struct hl_device *hdev, u32 queue_id,
 				dma_addr_t *dma_handle, u16 *queue_len);
 	int (*test_queues)(struct hl_device *hdev);
@@ -794,28 +927,34 @@ struct hl_asic_funcs {
 	int (*get_eeprom_data)(struct hl_device *hdev, void *data,
 				size_t max_size);
 	int (*send_cpu_message)(struct hl_device *hdev, u32 *msg,
-				u16 len, u32 timeout, long *result);
-	enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev);
+				u16 len, u32 timeout, u64 *result);
 	int (*pci_bars_map)(struct hl_device *hdev);
 	int (*init_iatu)(struct hl_device *hdev);
 	u32 (*rreg)(struct hl_device *hdev, u32 reg);
 	void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
 	void (*halt_coresight)(struct hl_device *hdev);
 	int (*ctx_init)(struct hl_ctx *ctx);
+	void (*ctx_fini)(struct hl_ctx *ctx);
 	int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
 	u32 (*get_queue_id_for_cq)(struct hl_device *hdev, u32 cq_idx);
-	void (*read_device_fw_version)(struct hl_device *hdev,
+	int (*read_device_fw_version)(struct hl_device *hdev,
 					enum hl_fw_component fwc);
 	int (*load_firmware_to_device)(struct hl_device *hdev);
 	int (*load_boot_fit_to_device)(struct hl_device *hdev);
 	u32 (*get_signal_cb_size)(struct hl_device *hdev);
 	u32 (*get_wait_cb_size)(struct hl_device *hdev);
-	void (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id);
-	void (*gen_wait_cb)(struct hl_device *hdev, void *data, u16 sob_id,
-				u16 sob_val, u16 mon_id, u32 q_idx);
+	u32 (*gen_signal_cb)(struct hl_device *hdev, void *data, u16 sob_id,
+			u32 size);
+	u32 (*gen_wait_cb)(struct hl_device *hdev,
+			struct hl_gen_wait_properties *prop);
 	void (*reset_sob)(struct hl_device *hdev, void *data);
+	void (*reset_sob_group)(struct hl_device *hdev, u16 sob_group);
 	void (*set_dma_mask_from_fw)(struct hl_device *hdev);
 	u64 (*get_device_time)(struct hl_device *hdev);
+	void (*collective_wait_init_cs)(struct hl_cs *cs);
+	int (*collective_wait_create_jobs)(struct hl_device *hdev,
+			struct hl_ctx *ctx, struct hl_cs *cs, u32 wait_queue_id,
+			u32 collective_engine_id);
 };
 
 
@@ -826,17 +965,48 @@ struct hl_asic_funcs {
 #define HL_KERNEL_ASID_ID	0
 
 /**
+ * enum hl_va_range_type - virtual address range type.
+ * @HL_VA_RANGE_TYPE_HOST: range type of host pages
+ * @HL_VA_RANGE_TYPE_HOST_HUGE: range type of host huge pages
+ * @HL_VA_RANGE_TYPE_DRAM: range type of dram pages
+ */
+enum hl_va_range_type {
+	HL_VA_RANGE_TYPE_HOST,
+	HL_VA_RANGE_TYPE_HOST_HUGE,
+	HL_VA_RANGE_TYPE_DRAM,
+	HL_VA_RANGE_TYPE_MAX
+};
+
+/**
  * struct hl_va_range - virtual addresses range.
  * @lock: protects the virtual addresses list.
  * @list: list of virtual addresses blocks available for mappings.
  * @start_addr: range start address.
  * @end_addr: range end address.
+ * @page_size: page size of this va range.
  */
 struct hl_va_range {
 	struct mutex		lock;
 	struct list_head	list;
 	u64			start_addr;
 	u64			end_addr;
+	u32			page_size;
+};
+
+/**
+ * struct hl_cs_counters_atomic - command submission counters
+ * @out_of_mem_drop_cnt: dropped due to memory allocation issue
+ * @parsing_drop_cnt: dropped due to error in packet parsing
+ * @queue_full_drop_cnt: dropped due to queue full
+ * @device_in_reset_drop_cnt: dropped due to device in reset
+ * @max_cs_in_flight_drop_cnt: dropped due to maximum CS in-flight
+ */
+struct hl_cs_counters_atomic {
+	atomic64_t out_of_mem_drop_cnt;
+	atomic64_t parsing_drop_cnt;
+	atomic64_t queue_full_drop_cnt;
+	atomic64_t device_in_reset_drop_cnt;
+	atomic64_t max_cs_in_flight_drop_cnt;
 };
 
 /**
@@ -849,14 +1019,12 @@ struct hl_va_range {
  * @refcount: reference counter for the context. Context is released only when
  *		this hits 0l. It is incremented on CS and CS_WAIT.
  * @cs_pending: array of hl fence objects representing pending CS.
- * @host_va_range: holds available virtual addresses for host mappings.
- * @host_huge_va_range: holds available virtual addresses for host mappings
- *                      with huge pages.
- * @dram_va_range: holds available virtual addresses for DRAM mappings.
+ * @va_range: holds available virtual addresses for host and dram mappings.
  * @mem_hash_lock: protects the mem_hash.
  * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
  *            MMU hash or walking the PGT requires talking this lock.
  * @debugfs_list: node in debugfs list of contexts.
+ * @cs_counters: context command submission counters.
  * @cb_va_pool: device VA pool for command buffers which are mapped to the
  *              device's MMU.
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
@@ -879,26 +1047,24 @@ struct hl_va_range {
 struct hl_ctx {
 	DECLARE_HASHTABLE(mem_hash, MEM_HASH_TABLE_BITS);
 	DECLARE_HASHTABLE(mmu_shadow_hash, MMU_HASH_TABLE_BITS);
-	struct hl_fpriv		*hpriv;
-	struct hl_device	*hdev;
-	struct kref		refcount;
-	struct hl_fence		**cs_pending;
-	struct hl_va_range	*host_va_range;
-	struct hl_va_range	*host_huge_va_range;
-	struct hl_va_range	*dram_va_range;
-	struct mutex		mem_hash_lock;
-	struct mutex		mmu_lock;
-	struct list_head	debugfs_list;
-	struct hl_cs_counters	cs_counters;
-	struct gen_pool		*cb_va_pool;
-	u64			cs_sequence;
-	u64			*dram_default_hops;
-	spinlock_t		cs_lock;
-	atomic64_t		dram_phys_mem;
-	atomic_t		thread_ctx_switch_token;
-	u32			thread_ctx_switch_wait_token;
-	u32			asid;
-	u32			handle;
+	struct hl_fpriv			*hpriv;
+	struct hl_device		*hdev;
+	struct kref			refcount;
+	struct hl_fence			**cs_pending;
+	struct hl_va_range		*va_range[HL_VA_RANGE_TYPE_MAX];
+	struct mutex			mem_hash_lock;
+	struct mutex			mmu_lock;
+	struct list_head		debugfs_list;
+	struct hl_cs_counters_atomic	cs_counters;
+	struct gen_pool			*cb_va_pool;
+	u64				cs_sequence;
+	u64				*dram_default_hops;
+	spinlock_t			cs_lock;
+	atomic64_t			dram_phys_mem;
+	atomic_t			thread_ctx_switch_token;
+	u32				thread_ctx_switch_wait_token;
+	u32				asid;
+	u32				handle;
 };
 
 /**
@@ -963,6 +1129,7 @@ struct hl_userptr {
  * @tdr_active: true if TDR was activated for this CS (to prevent
  *		double TDR activation).
  * @aborted: true if CS was aborted due to some device error.
+ * @timestamp: true if a timestmap must be captured upon completion
  */
 struct hl_cs {
 	u16			*jobs_in_queue_cnt;
@@ -983,6 +1150,7 @@ struct hl_cs {
 	u8			timedout;
 	u8			tdr_active;
 	u8			aborted;
+	u8			timestamp;
 };
 
 /**
@@ -996,6 +1164,7 @@ struct hl_cs {
  * @userptr_list: linked-list of userptr mappings that belong to this job and
  *			wait for completion.
  * @debugfs_list: node in debugfs list of command submission jobs.
+ * @refcount: reference counter for usage of the CS job.
  * @queue_type: the type of the H/W queue this job is submitted to.
  * @id: the id of this job inside a CS.
  * @hw_queue_id: the id of the H/W queue this job is submitted to.
@@ -1019,6 +1188,7 @@ struct hl_cs_job {
 	struct work_struct	finish_work;
 	struct list_head	userptr_list;
 	struct list_head	debugfs_list;
+	struct kref		refcount;
 	enum hl_queue_type	queue_type;
 	u32			id;
 	u32			hw_queue_id;
@@ -1067,7 +1237,6 @@ struct hl_cs_parser {
 	u8			contains_dma_pkt;
 };
 
-
 /*
  * MEMORY STRUCTURE
  */
@@ -1285,6 +1454,10 @@ struct hl_dbg_device_entry {
  * DEVICES
  */
 
+#define HL_STR_MAX	32
+
+#define HL_DEV_STS_MAX (HL_DEVICE_STATUS_NEEDS_RESET + 1)
+
 /* Theoretical limit only. A single host can only contain up to 4 or 8 PCIe
  * x16 cards. In extreme cases, there are hosts that can accommodate 16 cards.
  */
@@ -1428,11 +1601,13 @@ struct hwmon_chip_info;
 
 /**
  * struct hl_device_reset_work - reset workqueue task wrapper.
+ * @wq: work queue for device reset procedure.
  * @reset_work: reset work to be done.
  * @hdev: habanalabs device structure.
  */
 struct hl_device_reset_work {
-	struct work_struct		reset_work;
+	struct workqueue_struct		*wq;
+	struct delayed_work		reset_work;
 	struct hl_device		*hdev;
 };
 
@@ -1446,18 +1621,78 @@ struct hl_device_idle_busy_ts {
 	ktime_t				busy_to_idle_ts;
 };
 
+/**
+ * struct hr_mmu_hop_addrs - used for holding per-device host-resident mmu hop
+ * information.
+ * @virt_addr: the virtual address of the hop.
+ * @phys-addr: the physical address of the hop (used by the device-mmu).
+ * @shadow_addr: The shadow of the hop used by the driver for walking the hops.
+ */
+struct hr_mmu_hop_addrs {
+	u64 virt_addr;
+	u64 phys_addr;
+	u64 shadow_addr;
+};
 
 /**
- * struct hl_mmu_priv - used for holding per-device mmu internal information.
+ * struct hl_mmu_hr_pgt_priv - used for holding per-device mmu host-resident
+ * page-table internal information.
  * @mmu_pgt_pool: pool of page tables used by MMU for allocating hops.
  * @mmu_shadow_hop0: shadow array of hop0 tables.
  */
-struct hl_mmu_priv {
+struct hl_mmu_hr_priv {
+	struct gen_pool *mmu_pgt_pool;
+	struct hr_mmu_hop_addrs *mmu_shadow_hop0;
+};
+
+/**
+ * struct hl_mmu_dr_pgt_priv - used for holding per-device mmu device-resident
+ * page-table internal information.
+ * @mmu_pgt_pool: pool of page tables used by MMU for allocating hops.
+ * @mmu_shadow_hop0: shadow array of hop0 tables.
+ */
+struct hl_mmu_dr_priv {
 	struct gen_pool *mmu_pgt_pool;
 	void *mmu_shadow_hop0;
 };
 
 /**
+ * struct hl_mmu_priv - used for holding per-device mmu internal information.
+ * @dr: information on the device-resident MMU, when exists.
+ * @hr: information on the host-resident MMU, when exists.
+ */
+struct hl_mmu_priv {
+	struct hl_mmu_dr_priv dr;
+	struct hl_mmu_hr_priv hr;
+};
+
+/**
+ * struct hl_mmu_per_hop_info - A structure describing one TLB HOP and its entry
+ *                that was created in order to translate a virtual address to a
+ *                physical one.
+ * @hop_addr: The address of the hop.
+ * @hop_pte_addr: The address of the hop entry.
+ * @hop_pte_val: The value in the hop entry.
+ */
+struct hl_mmu_per_hop_info {
+	u64 hop_addr;
+	u64 hop_pte_addr;
+	u64 hop_pte_val;
+};
+
+/**
+ * struct hl_mmu_hop_info - A structure describing the TLB hops and their
+ * hop-entries that were created in order to translate a virtual address to a
+ * physical one.
+ * @hop_info: Array holding the per-hop information used for the translation.
+ * @used_hops: The number of hops used for the translation.
+ */
+struct hl_mmu_hop_info {
+	struct hl_mmu_per_hop_info hop_info[MMU_ARCH_5_HOPS];
+	u32 used_hops;
+};
+
+/**
  * struct hl_mmu_funcs - Device related MMU functions.
  * @init: initialize the MMU module.
  * @fini: release the MMU module.
@@ -1468,6 +1703,9 @@ struct hl_mmu_priv {
  * @flush: flush all writes from all cores to reach device MMU.
  * @swap_out: marks all mapping of the given context as swapped out.
  * @swap_in: marks all mapping of the given context as swapped in.
+ * @get_tlb_info: returns the list of hops and hop-entries used that were
+ *                created in order to translate the giver virtual address to a
+ *                physical one.
  */
 struct hl_mmu_funcs {
 	int (*init)(struct hl_device *hdev);
@@ -1482,6 +1720,8 @@ struct hl_mmu_funcs {
 	void (*flush)(struct hl_ctx *ctx);
 	void (*swap_out)(struct hl_ctx *ctx);
 	void (*swap_in)(struct hl_ctx *ctx);
+	int (*get_tlb_info)(struct hl_ctx *ctx,
+			u64 virt_addr, struct hl_mmu_hop_info *hops);
 };
 
 /**
@@ -1497,6 +1737,7 @@ struct hl_mmu_funcs {
  * @dev_ctrl: related kernel device structure for the control device
  * @work_freq: delayed work to lower device frequency if possible.
  * @work_heartbeat: delayed work for CPU-CP is-alive check.
+ * @device_reset_work: delayed work which performs hard reset
  * @asic_name: ASIC specific name.
  * @asic_type: ASIC specific type.
  * @completion_queue: array of hl_cq.
@@ -1505,8 +1746,8 @@ struct hl_mmu_funcs {
  * @eq_wq: work queue of event queue for executing work in process context.
  * @kernel_ctx: Kernel driver context structure.
  * @kernel_queues: array of hl_hw_queue.
- * @hw_queues_mirror_list: CS mirror list for TDR.
- * @hw_queues_mirror_lock: protects hw_queues_mirror_list.
+ * @cs_mirror_list: CS mirror list for TDR.
+ * @cs_mirror_lock: protects cs_mirror_list.
  * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
  * @event_queue: event queue for IRQ from CPU-CP.
  * @dma_pool: DMA pool for small allocations.
@@ -1525,6 +1766,7 @@ struct hl_mmu_funcs {
  * @hwmon_dev: H/W monitor device.
  * @pm_mng_profile: current power management profile.
  * @hl_chip_info: ASIC's sensors information.
+ * @device_status_description: device status description.
  * @hl_debugfs: device's debugfs manager.
  * @cb_pool: list of preallocated CBs.
  * @cb_pool_lock: protects the CB pool.
@@ -1572,13 +1814,12 @@ struct hl_mmu_funcs {
  * @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
  * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
  *                   otherwise.
- * @dram_supports_virtual_memory: is MMU enabled towards DRAM.
  * @dram_default_page_mapping: is DRAM default page mapping enabled.
+ * @memory_scrub: true to perform device memory scrub in various locations,
+ *                such as context-switch, context close, page free, etc.
  * @pmmu_huge_range: is a different virtual addresses range used for PMMU with
  *                   huge pages.
  * @init_done: is the initialization of the device done.
- * @mmu_enable: is MMU enabled.
- * @mmu_huge_page_opt: is MMU huge pages optimization enabled.
  * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
  * @dma_mask: the dma mask that was set for this device
  * @in_debug: is device under debug. This, together with fpriv_list, enforces
@@ -1589,9 +1830,16 @@ struct hl_mmu_funcs {
  * @stop_on_err: true if engines should stop on error.
  * @supports_sync_stream: is sync stream supported.
  * @sync_stream_queue_idx: helper index for sync stream queues initialization.
+ * @collective_mon_idx: helper index for collective initialization
  * @supports_coresight: is CoreSight supported.
  * @supports_soft_reset: is soft reset supported.
  * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
+ * @needs_reset: true if reset_on_lockup is false and device should be reset
+ *               due to lockup.
+ * @process_kill_trial_cnt: number of trials reset thread tried killing
+ *                          user processes
+ * @device_fini_pending: true if device_fini was called and might be
+ *                       waiting for the reset thread to finish
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -1604,15 +1852,17 @@ struct hl_device {
 	struct device			*dev_ctrl;
 	struct delayed_work		work_freq;
 	struct delayed_work		work_heartbeat;
-	char				asic_name[32];
+	struct hl_device_reset_work	device_reset_work;
+	char				asic_name[HL_STR_MAX];
+	char				status[HL_DEV_STS_MAX][HL_STR_MAX];
 	enum hl_asic_type		asic_type;
 	struct hl_cq			*completion_queue;
 	struct workqueue_struct		**cq_wq;
 	struct workqueue_struct		*eq_wq;
 	struct hl_ctx			*kernel_ctx;
 	struct hl_hw_queue		*kernel_queues;
-	struct list_head		hw_queues_mirror_list;
-	spinlock_t			hw_queues_mirror_lock;
+	struct list_head		cs_mirror_list;
+	spinlock_t			cs_mirror_lock;
 	struct hl_cb_mgr		kernel_cb_mgr;
 	struct hl_eq			event_queue;
 	struct dma_pool			*dma_pool;
@@ -1649,10 +1899,10 @@ struct hl_device {
 
 	struct hl_device_idle_busy_ts	*idle_busy_ts_arr;
 
-	struct hl_cs_counters		aggregated_cs_counters;
+	struct hl_cs_counters_atomic	aggregated_cs_counters;
 
 	struct hl_mmu_priv		mmu_priv;
-	struct hl_mmu_funcs		mmu_func;
+	struct hl_mmu_funcs		mmu_func[MMU_NUM_PGT_LOCATIONS];
 
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
@@ -1677,8 +1927,8 @@ struct hl_device {
 	u8				hard_reset_pending;
 	u8				heartbeat;
 	u8				reset_on_lockup;
-	u8				dram_supports_virtual_memory;
 	u8				dram_default_page_mapping;
+	u8				memory_scrub;
 	u8				pmmu_huge_range;
 	u8				init_done;
 	u8				device_cpu_disabled;
@@ -1689,17 +1939,22 @@ struct hl_device {
 	u8				stop_on_err;
 	u8				supports_sync_stream;
 	u8				sync_stream_queue_idx;
+	u8				collective_mon_idx;
 	u8				supports_coresight;
 	u8				supports_soft_reset;
 	u8				supports_cb_mapping;
+	u8				needs_reset;
+	u8				process_kill_trial_cnt;
+	u8				device_fini_pending;
 
 	/* Parameters for bring-up */
+	u64				nic_ports_mask;
+	u64				fw_loading;
 	u8				mmu_enable;
 	u8				mmu_huge_page_opt;
 	u8				cpu_enable;
 	u8				reset_pcilink;
 	u8				cpu_queues_enable;
-	u8				fw_loading;
 	u8				pldm;
 	u8				axi_drain;
 	u8				sram_scrambler_enable;
@@ -1707,6 +1962,7 @@ struct hl_device {
 	u8				hard_reset_on_fw_events;
 	u8				bmc_enable;
 	u8				rl_enable;
+	u8				reset_on_preboot_fail;
 };
 
 
@@ -1793,7 +2049,8 @@ static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
 
 int hl_device_open(struct inode *inode, struct file *filp);
 int hl_device_open_ctrl(struct inode *inode, struct file *filp);
-bool hl_device_disabled_or_in_reset(struct hl_device *hdev);
+bool hl_device_operational(struct hl_device *hdev,
+		enum hl_device_status *status);
 enum hl_device_status hl_device_status(struct hl_device *hdev);
 int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
 int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
@@ -1878,8 +2135,10 @@ void hl_cs_rollback_all(struct hl_device *hdev);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 void hl_sob_reset_error(struct kref *ref);
+int hl_gen_sob_mask(u16 sob_base, u8 sob_mask, u8 *mask);
 void hl_fence_put(struct hl_fence *fence);
 void hl_fence_get(struct hl_fence *fence);
+void cs_get(struct hl_cs *cs);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 void gaudi_set_asic_funcs(struct hl_device *hdev);
@@ -1890,6 +2149,10 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx);
 int hl_vm_init(struct hl_device *hdev);
 void hl_vm_fini(struct hl_device *hdev);
 
+u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
+		enum hl_va_range_type type, u32 size, u32 alignment);
+int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
+		u64 start_addr, u64 size);
 int hl_pin_host_memory(struct hl_device *hdev, u64 addr, u64 size,
 			struct hl_userptr *userptr);
 void hl_unpin_host_memory(struct hl_device *hdev, struct hl_userptr *userptr);
@@ -1903,20 +2166,26 @@ int hl_mmu_init(struct hl_device *hdev);
 void hl_mmu_fini(struct hl_device *hdev);
 int hl_mmu_ctx_init(struct hl_ctx *ctx);
 void hl_mmu_ctx_fini(struct hl_ctx *ctx);
-int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
+int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		u32 page_size, bool flush_pte);
-int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
+int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 		bool flush_pte);
+int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr,
+					u64 phys_addr, u32 size);
+int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size);
 void hl_mmu_swap_out(struct hl_ctx *ctx);
 void hl_mmu_swap_in(struct hl_ctx *ctx);
 int hl_mmu_if_set_funcs(struct hl_device *hdev);
-void hl_mmu_v1_set_funcs(struct hl_device *hdev);
+void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu);
+int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr);
+int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
+			struct hl_mmu_hop_info *hops);
 
 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
-				void __iomem *dst);
+				void __iomem *dst, u32 src_offset, u32 size);
 int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode);
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
-				u16 len, u32 timeout, long *result);
+				u16 len, u32 timeout, u64 *result);
 int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type);
 int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 		size_t irq_arr_size);
@@ -1926,18 +2195,22 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
 void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 					void *vaddr);
 int hl_fw_send_heartbeat(struct hl_device *hdev);
-int hl_fw_cpucp_info_get(struct hl_device *hdev);
+int hl_fw_cpucp_info_get(struct hl_device *hdev,
+			u32 cpu_security_boot_status_reg);
 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
 int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
 		struct hl_info_pci_counters *counters);
 int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
 			u64 *total_energy);
+int hl_fw_cpucp_pll_info_get(struct hl_device *hdev, u16 pll_index,
+		u16 *pll_freq_arr);
 int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 			u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
-			u32 boot_err0_reg, bool skip_bmc,
-			u32 cpu_timeout, u32 boot_fit_timeout);
-int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
-				u32 boot_err0_reg, u32 timeout);
+			u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
+			bool skip_bmc, u32 cpu_timeout, u32 boot_fit_timeout);
+int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
+		u32 cpu_security_boot_status_reg, u32 boot_err0_reg,
+		u32 timeout);
 
 int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
 			bool is_wc[3]);
@@ -1946,8 +2219,7 @@ int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
 		struct hl_inbound_pci_region *pci_region);
 int hl_pci_set_outbound_region(struct hl_device *hdev,
 		struct hl_outbound_pci_region *pci_region);
-int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
-		u32 boot_err0_reg, u32 preboot_ver_timeout);
+int hl_pci_init(struct hl_device *hdev);
 void hl_pci_fini(struct hl_device *hdev);
 
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index f9067d3ef437..6bbb6bca6860 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -29,6 +29,7 @@ static DEFINE_MUTEX(hl_devs_idr_lock);
 
 static int timeout_locked = 5;
 static int reset_on_lockup = 1;
+static int memory_scrub = 1;
 
 module_param(timeout_locked, int, 0444);
 MODULE_PARM_DESC(timeout_locked,
@@ -38,6 +39,10 @@ module_param(reset_on_lockup, int, 0444);
 MODULE_PARM_DESC(reset_on_lockup,
 	"Do device reset on lockup (0 = no, 1 = yes, default yes)");
 
+module_param(memory_scrub, int, 0444);
+MODULE_PARM_DESC(memory_scrub,
+	"Scrub device memory in various states (0 = no, 1 = yes, default yes)");
+
 #define PCI_VENDOR_ID_HABANALABS	0x1da3
 
 #define PCI_IDS_GOYA			0x0001
@@ -87,6 +92,7 @@ static enum hl_asic_type get_asic_type(u16 device)
  */
 int hl_device_open(struct inode *inode, struct file *filp)
 {
+	enum hl_device_status status;
 	struct hl_device *hdev;
 	struct hl_fpriv *hpriv;
 	int rc;
@@ -119,10 +125,10 @@ int hl_device_open(struct inode *inode, struct file *filp)
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_err_ratelimited(hdev->dev,
-			"Can't open %s because it is disabled or in reset\n",
-			dev_name(hdev->dev));
+			"Can't open %s because it is %s\n",
+			dev_name(hdev->dev), hdev->status[status]);
 		rc = -EPERM;
 		goto out_err;
 	}
@@ -199,7 +205,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		dev_err_ratelimited(hdev->dev_ctrl,
 			"Can't open %s because it is disabled or in reset\n",
 			dev_name(hdev->dev_ctrl));
@@ -228,19 +234,20 @@ out_err:
 
 static void set_driver_behavior_per_device(struct hl_device *hdev)
 {
-	hdev->mmu_enable = 1;
 	hdev->cpu_enable = 1;
-	hdev->fw_loading = 1;
+	hdev->fw_loading = FW_TYPE_ALL_TYPES;
 	hdev->cpu_queues_enable = 1;
 	hdev->heartbeat = 1;
+	hdev->mmu_enable = 1;
 	hdev->clock_gating_mask = ULONG_MAX;
-
-	hdev->reset_pcilink = 0;
-	hdev->axi_drain = 0;
 	hdev->sram_scrambler_enable = 1;
 	hdev->dram_scrambler_enable = 1;
 	hdev->bmc_enable = 1;
 	hdev->hard_reset_on_fw_events = 1;
+	hdev->reset_on_preboot_fail = 1;
+
+	hdev->reset_pcilink = 0;
+	hdev->axi_drain = 0;
 }
 
 /*
@@ -281,8 +288,17 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 		hdev->asic_type = asic_type;
 	}
 
+	/* Assign status description string */
+	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
+					"disabled", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
+					"in reset", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
+					"needs reset", HL_STR_MAX);
+
 	hdev->major = hl_major;
 	hdev->reset_on_lockup = reset_on_lockup;
+	hdev->memory_scrub = memory_scrub;
 	hdev->pldm = 0;
 
 	set_driver_behavior_per_device(hdev);
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 07317ea49129..32e6af1db4e3 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -314,20 +314,45 @@ static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 
 static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	struct hl_info_cs_counters cs_counters = {0};
 	struct hl_device *hdev = hpriv->hdev;
-	struct hl_info_cs_counters cs_counters = { {0} };
+	struct hl_cs_counters_atomic *cntr;
 	u32 max_size = args->return_size;
-	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	cntr = &hdev->aggregated_cs_counters;
 
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	memcpy(&cs_counters.cs_counters, &hdev->aggregated_cs_counters,
-			sizeof(struct hl_cs_counters));
-
-	if (hpriv->ctx)
-		memcpy(&cs_counters.ctx_cs_counters, &hpriv->ctx->cs_counters,
-				sizeof(struct hl_cs_counters));
+	cs_counters.total_out_of_mem_drop_cnt =
+			atomic64_read(&cntr->out_of_mem_drop_cnt);
+	cs_counters.total_parsing_drop_cnt =
+			atomic64_read(&cntr->parsing_drop_cnt);
+	cs_counters.total_queue_full_drop_cnt =
+			atomic64_read(&cntr->queue_full_drop_cnt);
+	cs_counters.total_device_in_reset_drop_cnt =
+			atomic64_read(&cntr->device_in_reset_drop_cnt);
+	cs_counters.total_max_cs_in_flight_drop_cnt =
+			atomic64_read(&cntr->max_cs_in_flight_drop_cnt);
+
+	if (hpriv->ctx) {
+		cs_counters.ctx_out_of_mem_drop_cnt =
+				atomic64_read(
+				&hpriv->ctx->cs_counters.out_of_mem_drop_cnt);
+		cs_counters.ctx_parsing_drop_cnt =
+				atomic64_read(
+				&hpriv->ctx->cs_counters.parsing_drop_cnt);
+		cs_counters.ctx_queue_full_drop_cnt =
+				atomic64_read(
+				&hpriv->ctx->cs_counters.queue_full_drop_cnt);
+		cs_counters.ctx_device_in_reset_drop_cnt =
+				atomic64_read(
+			&hpriv->ctx->cs_counters.device_in_reset_drop_cnt);
+		cs_counters.ctx_max_cs_in_flight_drop_cnt =
+				atomic64_read(
+			&hpriv->ctx->cs_counters.max_cs_in_flight_drop_cnt);
+	}
 
 	return copy_to_user(out, &cs_counters,
 		min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0;
@@ -378,11 +403,32 @@ static int total_energy_consumption_info(struct hl_fpriv *hpriv,
 		min((size_t) max_size, sizeof(total_energy))) ? -EFAULT : 0;
 }
 
+static int pll_frequency_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_pll_frequency_info freq_info = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_cpucp_pll_info_get(hdev, args->pll_index, freq_info.output);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &freq_info,
+		min((size_t) max_size, sizeof(freq_info))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
+	enum hl_device_status status;
 	struct hl_info_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
+
 	int rc;
 
 	/*
@@ -403,10 +449,10 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 		break;
 	}
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(dev,
 			"Device is %s. Can't execute INFO IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -453,6 +499,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_TOTAL_ENERGY:
 		return total_energy_consumption_info(hpriv, args);
 
+	case HL_INFO_PLL_FREQUENCY:
+		return pll_frequency_info(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -ENOTTY;
@@ -476,12 +525,14 @@ static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data)
 {
 	struct hl_debug_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
+	enum hl_device_status status;
+
 	int rc = 0;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't execute DEBUG IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -544,7 +595,7 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
 	int retcode;
 
 	if (hdev->hard_reset_pending) {
-		dev_crit_ratelimited(hdev->dev_ctrl,
+		dev_crit_ratelimited(dev,
 			"Device HARD reset pending! Please close FD\n");
 		return -ENODEV;
 	}
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 250cf9cefc06..7caf868d1585 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -48,6 +48,11 @@ void hl_int_hw_queue_update_ci(struct hl_cs *cs)
 		return;
 
 	q = &hdev->kernel_queues[0];
+
+	/* There are no internal queues if H/W queues are being used */
+	if (!hdev->asic_prop.max_queues || q->queue_type == QUEUE_TYPE_HW)
+		return;
+
 	for (i = 0 ; i < hdev->asic_prop.max_queues ; i++, q++) {
 		if (q->queue_type == QUEUE_TYPE_INT)
 			atomic_add(cs->jobs_in_queue_cnt[i], &q->ci);
@@ -333,7 +338,14 @@ static void int_queue_schedule_job(struct hl_cs_job *job)
 
 	bd.ctl = 0;
 	bd.len = cpu_to_le32(job->job_cb_size);
-	bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);
+
+	if (job->is_kernel_allocated_cb)
+		/* bus_address is actually a mmu mapped address
+		 * allocated from an internal pool
+		 */
+		bd.ptr = cpu_to_le64(job->user_cb->bus_address);
+	else
+		bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);
 
 	pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd);
 
@@ -388,6 +400,91 @@ static void hw_queue_schedule_job(struct hl_cs_job *job)
 	ext_and_hw_queue_submit_bd(hdev, q, ctl, len, ptr);
 }
 
+static void init_signal_cs(struct hl_device *hdev,
+		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
+{
+	struct hl_sync_stream_properties *prop;
+	struct hl_hw_sob *hw_sob;
+	u32 q_idx;
+
+	q_idx = job->hw_queue_id;
+	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+	hw_sob = &prop->hw_sob[prop->curr_sob_offset];
+
+	cs_cmpl->hw_sob = hw_sob;
+	cs_cmpl->sob_val = prop->next_sob_val++;
+
+	dev_dbg(hdev->dev,
+		"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
+		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx);
+
+	hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
+				cs_cmpl->hw_sob->sob_id, 0);
+
+	kref_get(&hw_sob->kref);
+
+	/* check for wraparound */
+	if (prop->next_sob_val == HL_MAX_SOB_VAL) {
+		/*
+		 * Decrement as we reached the max value.
+		 * The release function won't be called here as we've
+		 * just incremented the refcount.
+		 */
+		kref_put(&hw_sob->kref, hl_sob_reset_error);
+		prop->next_sob_val = 1;
+		/* only two SOBs are currently in use */
+		prop->curr_sob_offset =
+			(prop->curr_sob_offset + 1) % HL_RSVD_SOBS;
+
+		dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
+				prop->curr_sob_offset, q_idx);
+	}
+}
+
+static void init_wait_cs(struct hl_device *hdev, struct hl_cs *cs,
+		struct hl_cs_job *job, struct hl_cs_compl *cs_cmpl)
+{
+	struct hl_cs_compl *signal_cs_cmpl;
+	struct hl_sync_stream_properties *prop;
+	struct hl_gen_wait_properties wait_prop;
+	u32 q_idx;
+
+	q_idx = job->hw_queue_id;
+	prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+
+	signal_cs_cmpl = container_of(cs->signal_fence,
+					struct hl_cs_compl,
+					base_fence);
+
+	/* copy the SOB id and value of the signal CS */
+	cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
+	cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
+
+	dev_dbg(hdev->dev,
+		"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n",
+		cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
+		prop->base_mon_id, q_idx);
+
+	wait_prop.data = (void *) job->patched_cb;
+	wait_prop.sob_base = cs_cmpl->hw_sob->sob_id;
+	wait_prop.sob_mask = 0x1;
+	wait_prop.sob_val = cs_cmpl->sob_val;
+	wait_prop.mon_id = prop->base_mon_id;
+	wait_prop.q_idx = q_idx;
+	wait_prop.size = 0;
+	hdev->asic_funcs->gen_wait_cb(hdev, &wait_prop);
+
+	kref_get(&cs_cmpl->hw_sob->kref);
+	/*
+	 * Must put the signal fence after the SOB refcnt increment so
+	 * the SOB refcnt won't turn 0 and reset the SOB before the
+	 * wait CS was submitted.
+	 */
+	mb();
+	hl_fence_put(cs->signal_fence);
+	cs->signal_fence = NULL;
+}
+
 /*
  * init_signal_wait_cs - initialize a signal/wait CS
  * @cs: pointer to the signal/wait CS
@@ -398,84 +495,18 @@ static void init_signal_wait_cs(struct hl_cs *cs)
 {
 	struct hl_ctx *ctx = cs->ctx;
 	struct hl_device *hdev = ctx->hdev;
-	struct hl_hw_queue *hw_queue;
+	struct hl_cs_job *job;
 	struct hl_cs_compl *cs_cmpl =
 			container_of(cs->fence, struct hl_cs_compl, base_fence);
 
-	struct hl_hw_sob *hw_sob;
-	struct hl_cs_job *job;
-	u32 q_idx;
-
 	/* There is only one job in a signal/wait CS */
 	job = list_first_entry(&cs->job_list, struct hl_cs_job,
 				cs_node);
-	q_idx = job->hw_queue_id;
-	hw_queue = &hdev->kernel_queues[q_idx];
-
-	if (cs->type & CS_TYPE_SIGNAL) {
-		hw_sob = &hw_queue->hw_sob[hw_queue->curr_sob_offset];
-
-		cs_cmpl->hw_sob = hw_sob;
-		cs_cmpl->sob_val = hw_queue->next_sob_val++;
-
-		dev_dbg(hdev->dev,
-			"generate signal CB, sob_id: %d, sob val: 0x%x, q_idx: %d\n",
-			cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val, q_idx);
-
-		hdev->asic_funcs->gen_signal_cb(hdev, job->patched_cb,
-					cs_cmpl->hw_sob->sob_id);
-
-		kref_get(&hw_sob->kref);
-
-		/* check for wraparound */
-		if (hw_queue->next_sob_val == HL_MAX_SOB_VAL) {
-			/*
-			 * Decrement as we reached the max value.
-			 * The release function won't be called here as we've
-			 * just incremented the refcount.
-			 */
-			kref_put(&hw_sob->kref, hl_sob_reset_error);
-			hw_queue->next_sob_val = 1;
-			/* only two SOBs are currently in use */
-			hw_queue->curr_sob_offset =
-					(hw_queue->curr_sob_offset + 1) %
-						HL_RSVD_SOBS_IN_USE;
-
-			dev_dbg(hdev->dev, "switched to SOB %d, q_idx: %d\n",
-					hw_queue->curr_sob_offset, q_idx);
-		}
-	} else if (cs->type & CS_TYPE_WAIT) {
-		struct hl_cs_compl *signal_cs_cmpl;
-
-		signal_cs_cmpl = container_of(cs->signal_fence,
-						struct hl_cs_compl,
-						base_fence);
-
-		/* copy the the SOB id and value of the signal CS */
-		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
-		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
-
-		dev_dbg(hdev->dev,
-			"generate wait CB, sob_id: %d, sob_val: 0x%x, mon_id: %d, q_idx: %d\n",
-			cs_cmpl->hw_sob->sob_id, cs_cmpl->sob_val,
-			hw_queue->base_mon_id, q_idx);
 
-		hdev->asic_funcs->gen_wait_cb(hdev, job->patched_cb,
-						cs_cmpl->hw_sob->sob_id,
-						cs_cmpl->sob_val,
-						hw_queue->base_mon_id,
-						q_idx);
-
-		kref_get(&cs_cmpl->hw_sob->kref);
-		/*
-		 * Must put the signal fence after the SOB refcnt increment so
-		 * the SOB refcnt won't turn 0 and reset the SOB before the
-		 * wait CS was submitted.
-		 */
-		mb();
-		hl_fence_put(cs->signal_fence);
-		cs->signal_fence = NULL;
-	}
+	if (cs->type & CS_TYPE_SIGNAL)
+		init_signal_cs(hdev, job, cs_cmpl);
+	else if (cs->type & CS_TYPE_WAIT)
+		init_wait_cs(hdev, cs, job, cs_cmpl);
 }
 
 /*
@@ -484,19 +515,24 @@ static void init_signal_wait_cs(struct hl_cs *cs)
  */
 int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 {
+	enum hl_device_status status;
+	struct hl_cs_counters_atomic *cntr;
 	struct hl_ctx *ctx = cs->ctx;
 	struct hl_device *hdev = ctx->hdev;
 	struct hl_cs_job *job, *tmp;
 	struct hl_hw_queue *q;
-	u32 max_queues;
 	int rc = 0, i, cq_cnt;
+	u32 max_queues;
+
+	cntr = &hdev->aggregated_cs_counters;
 
 	hdev->asic_funcs->hw_queues_lock(hdev);
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
-		ctx->cs_counters.device_in_reset_drop_cnt++;
+	if (!hl_device_operational(hdev, &status)) {
+		atomic64_inc(&cntr->device_in_reset_drop_cnt);
+		atomic64_inc(&ctx->cs_counters.device_in_reset_drop_cnt);
 		dev_err(hdev->dev,
-			"device is disabled or in reset, CS rejected!\n");
+			"device is %s, CS rejected!\n", hdev->status[status]);
 		rc = -EPERM;
 		goto out;
 	}
@@ -527,7 +563,9 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 			}
 
 			if (rc) {
-				ctx->cs_counters.queue_full_drop_cnt++;
+				atomic64_inc(
+					&ctx->cs_counters.queue_full_drop_cnt);
+				atomic64_inc(&cntr->queue_full_drop_cnt);
 				goto unroll_cq_resv;
 			}
 
@@ -538,21 +576,23 @@ int hl_hw_queue_schedule_cs(struct hl_cs *cs)
 
 	if ((cs->type == CS_TYPE_SIGNAL) || (cs->type == CS_TYPE_WAIT))
 		init_signal_wait_cs(cs);
+	else if (cs->type == CS_TYPE_COLLECTIVE_WAIT)
+		hdev->asic_funcs->collective_wait_init_cs(cs);
 
-	spin_lock(&hdev->hw_queues_mirror_lock);
-	list_add_tail(&cs->mirror_node, &hdev->hw_queues_mirror_list);
+	spin_lock(&hdev->cs_mirror_lock);
+	list_add_tail(&cs->mirror_node, &hdev->cs_mirror_list);
 
 	/* Queue TDR if the CS is the first entry and if timeout is wanted */
 	if ((hdev->timeout_jiffies != MAX_SCHEDULE_TIMEOUT) &&
-			(list_first_entry(&hdev->hw_queues_mirror_list,
+			(list_first_entry(&hdev->cs_mirror_list,
 					struct hl_cs, mirror_node) == cs)) {
 		cs->tdr_active = true;
 		schedule_delayed_work(&cs->work_tdr, hdev->timeout_jiffies);
-		spin_unlock(&hdev->hw_queues_mirror_lock);
-	} else {
-		spin_unlock(&hdev->hw_queues_mirror_lock);
+
 	}
 
+	spin_unlock(&hdev->cs_mirror_lock);
+
 	if (!hdev->cs_active_cnt++) {
 		struct hl_device_idle_busy_ts *ts;
 
@@ -714,22 +754,56 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 
 static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
 {
-	struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
+	struct hl_sync_stream_properties *sync_stream_prop;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_hw_sob *hw_sob;
-	int sob, queue_idx = hdev->sync_stream_queue_idx++;
+	int sob, reserved_mon_idx, queue_idx;
+
+	sync_stream_prop = &hdev->kernel_queues[q_idx].sync_stream_prop;
+
+	/* We use 'collective_mon_idx' as a running index in order to reserve
+	 * monitors for collective master/slave queues.
+	 * collective master queue gets 2 reserved monitors
+	 * collective slave queue gets 1 reserved monitor
+	 */
+	if (hdev->kernel_queues[q_idx].collective_mode ==
+			HL_COLLECTIVE_MASTER) {
+		reserved_mon_idx = hdev->collective_mon_idx;
+
+		/* reserve the first monitor for collective master queue */
+		sync_stream_prop->collective_mstr_mon_id[0] =
+			prop->collective_first_mon + reserved_mon_idx;
+
+		/* reserve the second monitor for collective master queue */
+		sync_stream_prop->collective_mstr_mon_id[1] =
+			prop->collective_first_mon + reserved_mon_idx + 1;
+
+		hdev->collective_mon_idx += HL_COLLECTIVE_RSVD_MSTR_MONS;
+	} else if (hdev->kernel_queues[q_idx].collective_mode ==
+			HL_COLLECTIVE_SLAVE) {
+		reserved_mon_idx = hdev->collective_mon_idx++;
+
+		/* reserve a monitor for collective slave queue */
+		sync_stream_prop->collective_slave_mon_id =
+			prop->collective_first_mon + reserved_mon_idx;
+	}
+
+	if (!hdev->kernel_queues[q_idx].supports_sync_stream)
+		return;
+
+	queue_idx = hdev->sync_stream_queue_idx++;
 
-	hw_queue->base_sob_id =
-		prop->sync_stream_first_sob + queue_idx * HL_RSVD_SOBS;
-	hw_queue->base_mon_id =
-		prop->sync_stream_first_mon + queue_idx * HL_RSVD_MONS;
-	hw_queue->next_sob_val = 1;
-	hw_queue->curr_sob_offset = 0;
+	sync_stream_prop->base_sob_id = prop->sync_stream_first_sob +
+			(queue_idx * HL_RSVD_SOBS);
+	sync_stream_prop->base_mon_id = prop->sync_stream_first_mon +
+			(queue_idx * HL_RSVD_MONS);
+	sync_stream_prop->next_sob_val = 1;
+	sync_stream_prop->curr_sob_offset = 0;
 
 	for (sob = 0 ; sob < HL_RSVD_SOBS ; sob++) {
-		hw_sob = &hw_queue->hw_sob[sob];
+		hw_sob = &sync_stream_prop->hw_sob[sob];
 		hw_sob->hdev = hdev;
-		hw_sob->sob_id = hw_queue->base_sob_id + sob;
+		hw_sob->sob_id = sync_stream_prop->base_sob_id + sob;
 		hw_sob->q_idx = q_idx;
 		kref_init(&hw_sob->kref);
 	}
@@ -737,15 +811,16 @@ static void sync_stream_queue_init(struct hl_device *hdev, u32 q_idx)
 
 static void sync_stream_queue_reset(struct hl_device *hdev, u32 q_idx)
 {
-	struct hl_hw_queue *hw_queue = &hdev->kernel_queues[q_idx];
+	struct hl_sync_stream_properties *prop =
+			&hdev->kernel_queues[q_idx].sync_stream_prop;
 
 	/*
 	 * In case we got here due to a stuck CS, the refcnt might be bigger
 	 * than 1 and therefore we reset it.
 	 */
-	kref_init(&hw_queue->hw_sob[hw_queue->curr_sob_offset].kref);
-	hw_queue->curr_sob_offset = 0;
-	hw_queue->next_sob_val = 1;
+	kref_init(&prop->hw_sob[prop->curr_sob_offset].kref);
+	prop->curr_sob_offset = 0;
+	prop->next_sob_val = 1;
 }
 
 /*
@@ -788,8 +863,7 @@ static int queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 		break;
 	}
 
-	if (q->supports_sync_stream)
-		sync_stream_queue_init(hdev, q->hw_queue_id);
+	sync_stream_queue_init(hdev, q->hw_queue_id);
 
 	if (rc)
 		return rc;
@@ -867,6 +941,7 @@ int hl_hw_queues_create(struct hl_device *hdev)
 		q->queue_type = asic->hw_queues_props[i].type;
 		q->supports_sync_stream =
 				asic->hw_queues_props[i].supports_sync_stream;
+		q->collective_mode = asic->hw_queues_props[i].collective_mode;
 		rc = queue_init(hdev, q, i);
 		if (rc) {
 			dev_err(hdev->dev,
diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c
index 2ac29cb2fe61..6b421d76b311 100644
--- a/drivers/misc/habanalabs/common/hwmon.c
+++ b/drivers/misc/habanalabs/common/hwmon.c
@@ -114,7 +114,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	switch (type) {
@@ -192,7 +192,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	switch (type) {
@@ -312,6 +312,7 @@ int hl_get_temperature(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
 	struct cpucp_packet pkt;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -322,7 +323,9 @@ int hl_get_temperature(struct hl_device *hdev,
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, value);
+						0, &result);
+
+	*value = (long) result;
 
 	if (rc) {
 		dev_err(hdev->dev,
@@ -363,6 +366,7 @@ int hl_get_voltage(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
 	struct cpucp_packet pkt;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -373,7 +377,9 @@ int hl_get_voltage(struct hl_device *hdev,
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, value);
+						0, &result);
+
+	*value = (long) result;
 
 	if (rc) {
 		dev_err(hdev->dev,
@@ -389,6 +395,7 @@ int hl_get_current(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
 	struct cpucp_packet pkt;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -399,7 +406,9 @@ int hl_get_current(struct hl_device *hdev,
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, value);
+						0, &result);
+
+	*value = (long) result;
 
 	if (rc) {
 		dev_err(hdev->dev,
@@ -415,6 +424,7 @@ int hl_get_fan_speed(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
 	struct cpucp_packet pkt;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -425,7 +435,9 @@ int hl_get_fan_speed(struct hl_device *hdev,
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, value);
+						0, &result);
+
+	*value = (long) result;
 
 	if (rc) {
 		dev_err(hdev->dev,
@@ -441,6 +453,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
 	struct cpucp_packet pkt;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -451,7 +464,9 @@ int hl_get_pwm_info(struct hl_device *hdev,
 	pkt.type = __cpu_to_le16(attr);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, value);
+						0, &result);
+
+	*value = (long) result;
 
 	if (rc) {
 		dev_err(hdev->dev,
@@ -542,7 +557,7 @@ int hl_hwmon_init(struct hl_device *hdev)
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int rc;
 
-	if ((hdev->hwmon_initialized) || !(hdev->fw_loading))
+	if ((hdev->hwmon_initialized) || !(hdev->cpu_queues_enable))
 		return 0;
 
 	if (hdev->hl_chip_info->info) {
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 84227819e4d1..cbe9da4e0211 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -11,7 +11,6 @@
 
 #include <linux/uaccess.h>
 #include <linux/slab.h>
-#include <linux/genalloc.h>
 
 #define HL_MMU_DEBUG	0
 
@@ -46,7 +45,7 @@
  * @ret_handle          : result handle
  *
  * This function does the following:
- * - Allocate the requested size rounded up to 2MB pages
+ * - Allocate the requested size rounded up to 'dram_page_size' pages
  * - Return unique handle
  */
 static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
@@ -81,6 +80,16 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 				num_pgs, total_size);
 			return -ENOMEM;
 		}
+
+		if (hdev->memory_scrub) {
+			rc = hdev->asic_funcs->scrub_device_mem(hdev, paddr,
+					total_size);
+			if (rc) {
+				dev_err(hdev->dev,
+					"Failed to scrub contiguous device memory\n");
+				goto pages_pack_err;
+			}
+		}
 	}
 
 	phys_pg_pack = kzalloc(sizeof(*phys_pg_pack), GFP_KERNEL);
@@ -118,6 +127,17 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 				goto page_err;
 			}
 
+			if (hdev->memory_scrub) {
+				rc = hdev->asic_funcs->scrub_device_mem(hdev,
+						phys_pg_pack->pages[i],
+						page_size);
+				if (rc) {
+					dev_err(hdev->dev,
+						"Failed to scrub device memory\n");
+					goto page_err;
+				}
+			}
+
 			num_curr_pgs++;
 		}
 	}
@@ -601,6 +621,87 @@ out:
 }
 
 /*
+ * hl_reserve_va_block() - reserve a virtual block of a given size.
+ * @hdev: pointer to the habanalabs device structure.
+ * @ctx: current context
+ * @type: virtual addresses range type.
+ * @size: requested block size.
+ * @alignment: required alignment in bytes of the virtual block start address,
+ *             0 means no alignment.
+ *
+ * This function does the following:
+ * - Iterate on the virtual block list to find a suitable virtual block for the
+ *   given size and alignment.
+ * - Reserve the requested block and update the list.
+ * - Return the start address of the virtual block.
+ */
+u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
+		enum hl_va_range_type type, u32 size, u32 alignment)
+{
+	return get_va_block(hdev, ctx->va_range[type], size, 0,
+			max(alignment, ctx->va_range[type]->page_size));
+}
+
+/**
+ * hl_get_va_range_type() - get va_range type for the given address and size.
+ * @address: The start address of the area we want to validate.
+ * @size: The size in bytes of the area we want to validate.
+ * @type: returned va_range type
+ *
+ * Return: true if the area is inside a valid range, false otherwise.
+ */
+static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
+			enum hl_va_range_type *type)
+{
+	int i;
+
+	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX; i++) {
+		if (hl_mem_area_inside_range(address, size,
+				ctx->va_range[i]->start_addr,
+				ctx->va_range[i]->end_addr)) {
+			*type = i;
+			return 0;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * hl_unreserve_va_block - wrapper for add_va_block for unreserving a va block
+ *
+ * @hdev: pointer to the habanalabs device structure
+ * @ctx: current context
+ * @start: start virtual address
+ * @end: end virtual address
+ *
+ * This function does the following:
+ * - Takes the list lock and calls add_va_block_locked
+ */
+int hl_unreserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
+		u64 start_addr, u64 size)
+{
+	enum hl_va_range_type type;
+	int rc;
+
+	rc = hl_get_va_range_type(ctx, start_addr, size, &type);
+	if (rc) {
+		dev_err(hdev->dev,
+			"cannot find va_range for va %#llx size %llu",
+			start_addr, size);
+		return rc;
+	}
+
+	rc = add_va_block(hdev, ctx->va_range[type], start_addr,
+						start_addr + size - 1);
+	if (rc)
+		dev_warn(hdev->dev,
+			"add va block failed for vaddr: 0x%llx\n", start_addr);
+
+	return rc;
+}
+
+/*
  * get_sg_info - get number of pages and the DMA address from SG list
  *
  * @sg                 : the SG list
@@ -742,7 +843,7 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
 	for (i = 0 ; i < phys_pg_pack->npages ; i++) {
 		paddr = phys_pg_pack->pages[i];
 
-		rc = hl_mmu_map(ctx, next_vaddr, paddr, page_size,
+		rc = hl_mmu_map_page(ctx, next_vaddr, paddr, page_size,
 				(i + 1) == phys_pg_pack->npages);
 		if (rc) {
 			dev_err(hdev->dev,
@@ -761,7 +862,7 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
 err:
 	next_vaddr = vaddr;
 	for (i = 0 ; i < mapped_pg_cnt ; i++) {
-		if (hl_mmu_unmap(ctx, next_vaddr, page_size,
+		if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
 					(i + 1) == mapped_pg_cnt))
 			dev_warn_ratelimited(hdev->dev,
 				"failed to unmap handle %u, va: 0x%llx, pa: 0x%llx, page size: %u\n",
@@ -791,7 +892,7 @@ static void unmap_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
 	next_vaddr = vaddr;
 
 	for (i = 0 ; i < phys_pg_pack->npages ; i++, next_vaddr += page_size) {
-		if (hl_mmu_unmap(ctx, next_vaddr, page_size,
+		if (hl_mmu_unmap_page(ctx, next_vaddr, page_size,
 				       (i + 1) == phys_pg_pack->npages))
 			dev_warn_ratelimited(hdev->dev,
 			"unmap failed for vaddr: 0x%llx\n", next_vaddr);
@@ -888,7 +989,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 		/* get required alignment */
 		if (phys_pg_pack->page_size == page_size) {
-			va_range = ctx->host_va_range;
+			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
 
 			/*
 			 * huge page alignment may be needed in case of regular
@@ -903,7 +1004,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 			 * huge page alignment is needed in case of huge page
 			 * mapping
 			 */
-			va_range = ctx->host_huge_va_range;
+			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
 			va_block_align = huge_page_size;
 		}
 	} else {
@@ -928,7 +1029,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		hint_addr = args->map_device.hint_addr;
 
 		/* DRAM VA alignment is the same as the DRAM page size */
-		va_range = ctx->dram_va_range;
+		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
 		va_block_align = hdev->asic_prop.dmmu.page_size;
 	}
 
@@ -1073,12 +1174,12 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 
 		if (phys_pg_pack->page_size ==
 					hdev->asic_prop.pmmu.page_size)
-			va_range = ctx->host_va_range;
+			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST];
 		else
-			va_range = ctx->host_huge_va_range;
+			va_range = ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE];
 	} else if (*vm_type == VM_TYPE_PHYS_PACK) {
 		is_userptr = false;
-		va_range = ctx->dram_va_range;
+		va_range = ctx->va_range[HL_VA_RANGE_TYPE_DRAM];
 		phys_pg_pack = hnode->ptr;
 	} else {
 		dev_warn(hdev->dev,
@@ -1217,6 +1318,7 @@ out:
 
 int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 {
+	enum hl_device_status status;
 	union hl_mem_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_ctx *ctx = hpriv->ctx;
@@ -1224,10 +1326,10 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 	u32 handle = 0;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, &status)) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is %s. Can't execute MEMORY IOCTL\n",
-			atomic_read(&hdev->in_reset) ? "in_reset" : "disabled");
+			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -1236,18 +1338,35 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	switch (args->in.op) {
 	case HL_MEM_OP_ALLOC:
-		if (!hdev->dram_supports_virtual_memory) {
-			dev_err(hdev->dev, "DRAM alloc is not supported\n");
-			rc = -EINVAL;
-			goto out;
-		}
-
 		if (args->in.alloc.mem_size == 0) {
 			dev_err(hdev->dev,
 				"alloc size must be larger than 0\n");
 			rc = -EINVAL;
 			goto out;
 		}
+
+		/* If DRAM does not support virtual memory the driver won't
+		 * handle the allocation/freeing of that memory. However, for
+		 * system administration/monitoring purposes, the driver will
+		 * keep track of the amount of DRAM memory that is allocated
+		 * and freed by the user. Because this code totally relies on
+		 * the user's input, the driver can't ensure the validity
+		 * of this accounting.
+		 */
+		if (!hdev->asic_prop.dram_supports_virtual_memory) {
+			atomic64_add(args->in.alloc.mem_size,
+					&ctx->dram_phys_mem);
+			atomic64_add(args->in.alloc.mem_size,
+					&hdev->dram_used_mem);
+
+			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
+			rc = 0;
+
+			memset(args, 0, sizeof(*args));
+			args->out.handle = 0;
+			goto out;
+		}
+
 		rc = alloc_device_memory(ctx, &args->in, &handle);
 
 		memset(args, 0, sizeof(*args));
@@ -1255,6 +1374,26 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 		break;
 
 	case HL_MEM_OP_FREE:
+		/* If DRAM does not support virtual memory the driver won't
+		 * handle the allocation/freeing of that memory. However, for
+		 * system administration/monitoring purposes, the driver will
+		 * keep track of the amount of DRAM memory that is allocated
+		 * and freed by the user. Because this code totally relies on
+		 * the user's input, the driver can't ensure the validity
+		 * of this accounting.
+		 */
+		if (!hdev->asic_prop.dram_supports_virtual_memory) {
+			atomic64_sub(args->in.alloc.mem_size,
+					&ctx->dram_phys_mem);
+			atomic64_sub(args->in.alloc.mem_size,
+					&hdev->dram_used_mem);
+
+			dev_dbg(hdev->dev, "DRAM alloc is not supported\n");
+			rc = 0;
+
+			goto out;
+		}
+
 		rc = free_device_memory(ctx, args->in.free.handle);
 		break;
 
@@ -1498,7 +1637,7 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
  *   addresses.
  */
 static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
-				u64 start, u64 end)
+				u64 start, u64 end, u32 page_size)
 {
 	int rc;
 
@@ -1528,6 +1667,7 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
 
 	va_range->start_addr = start;
 	va_range->end_addr = end;
+	va_range->page_size = page_size;
 
 	return 0;
 }
@@ -1540,8 +1680,7 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
  * This function does the following:
  * - Frees the virtual addresses block list and its lock
  */
-static void va_range_fini(struct hl_device *hdev,
-		struct hl_va_range *va_range)
+static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
 {
 	mutex_lock(&va_range->lock);
 	clear_va_list_locked(hdev, &va_range->list);
@@ -1571,101 +1710,97 @@ static void va_range_fini(struct hl_device *hdev,
 static int vm_ctx_init_with_ranges(struct hl_ctx *ctx,
 					u64 host_range_start,
 					u64 host_range_end,
+					u32 host_page_size,
 					u64 host_huge_range_start,
 					u64 host_huge_range_end,
+					u32 host_huge_page_size,
 					u64 dram_range_start,
-					u64 dram_range_end)
+					u64 dram_range_end,
+					u32 dram_page_size)
 {
 	struct hl_device *hdev = ctx->hdev;
-	int rc;
-
-	ctx->host_va_range = kzalloc(sizeof(*ctx->host_va_range), GFP_KERNEL);
-	if (!ctx->host_va_range)
-		return -ENOMEM;
-
-	ctx->host_huge_va_range = kzalloc(sizeof(*ctx->host_huge_va_range),
-						GFP_KERNEL);
-	if (!ctx->host_huge_va_range) {
-		rc =  -ENOMEM;
-		goto host_huge_va_range_err;
-	}
-
-	ctx->dram_va_range = kzalloc(sizeof(*ctx->dram_va_range), GFP_KERNEL);
-	if (!ctx->dram_va_range) {
-		rc = -ENOMEM;
-		goto dram_va_range_err;
+	int i, rc;
+
+	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++) {
+		ctx->va_range[i] =
+			kzalloc(sizeof(struct hl_va_range), GFP_KERNEL);
+		if (!ctx->va_range[i]) {
+			rc = -ENOMEM;
+			goto free_va_range;
+		}
 	}
 
 	rc = hl_mmu_ctx_init(ctx);
 	if (rc) {
 		dev_err(hdev->dev, "failed to init context %d\n", ctx->asid);
-		goto mmu_ctx_err;
+		goto free_va_range;
 	}
 
 	mutex_init(&ctx->mem_hash_lock);
 	hash_init(ctx->mem_hash);
 
-	mutex_init(&ctx->host_va_range->lock);
+	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
 
-	rc = va_range_init(hdev, ctx->host_va_range, host_range_start,
-				host_range_end);
+	rc = va_range_init(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST],
+			host_range_start, host_range_end, host_page_size);
 	if (rc) {
 		dev_err(hdev->dev, "failed to init host vm range\n");
-		goto host_page_range_err;
+		goto mmu_ctx_fini;
 	}
 
 	if (hdev->pmmu_huge_range) {
-		mutex_init(&ctx->host_huge_va_range->lock);
+		mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
 
-		rc = va_range_init(hdev, ctx->host_huge_va_range,
-					host_huge_range_start,
-					host_huge_range_end);
+		rc = va_range_init(hdev,
+			ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE],
+			host_huge_range_start, host_huge_range_end,
+			host_huge_page_size);
 		if (rc) {
 			dev_err(hdev->dev,
 				"failed to init host huge vm range\n");
-			goto host_hpage_range_err;
+			goto clear_host_va_range;
 		}
 	} else {
-		ctx->host_huge_va_range = ctx->host_va_range;
+		kfree(ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
+		ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE] =
+				ctx->va_range[HL_VA_RANGE_TYPE_HOST];
 	}
 
-	mutex_init(&ctx->dram_va_range->lock);
+	mutex_init(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
 
-	rc = va_range_init(hdev, ctx->dram_va_range, dram_range_start,
-			dram_range_end);
+	rc = va_range_init(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM],
+			dram_range_start, dram_range_end, dram_page_size);
 	if (rc) {
 		dev_err(hdev->dev, "failed to init dram vm range\n");
-		goto dram_vm_err;
+		goto clear_host_huge_va_range;
 	}
 
 	hl_debugfs_add_ctx_mem_hash(hdev, ctx);
 
 	return 0;
 
-dram_vm_err:
-	mutex_destroy(&ctx->dram_va_range->lock);
+clear_host_huge_va_range:
+	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_DRAM]->lock);
 
 	if (hdev->pmmu_huge_range) {
-		mutex_lock(&ctx->host_huge_va_range->lock);
-		clear_va_list_locked(hdev, &ctx->host_huge_va_range->list);
-		mutex_unlock(&ctx->host_huge_va_range->lock);
+		mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
+		clear_va_list_locked(hdev,
+			&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->list);
+		mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
 	}
-host_hpage_range_err:
+clear_host_va_range:
 	if (hdev->pmmu_huge_range)
-		mutex_destroy(&ctx->host_huge_va_range->lock);
-	mutex_lock(&ctx->host_va_range->lock);
-	clear_va_list_locked(hdev, &ctx->host_va_range->list);
-	mutex_unlock(&ctx->host_va_range->lock);
-host_page_range_err:
-	mutex_destroy(&ctx->host_va_range->lock);
+		mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]->lock);
+	mutex_lock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
+	clear_va_list_locked(hdev, &ctx->va_range[HL_VA_RANGE_TYPE_HOST]->list);
+	mutex_unlock(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
+mmu_ctx_fini:
+	mutex_destroy(&ctx->va_range[HL_VA_RANGE_TYPE_HOST]->lock);
 	mutex_destroy(&ctx->mem_hash_lock);
 	hl_mmu_ctx_fini(ctx);
-mmu_ctx_err:
-	kfree(ctx->dram_va_range);
-dram_va_range_err:
-	kfree(ctx->host_huge_va_range);
-host_huge_va_range_err:
-	kfree(ctx->host_va_range);
+free_va_range:
+	for (i = 0 ; i < HL_VA_RANGE_TYPE_MAX ; i++)
+		kfree(ctx->va_range[i]);
 
 	return rc;
 }
@@ -1675,6 +1810,7 @@ int hl_vm_ctx_init(struct hl_ctx *ctx)
 	struct asic_fixed_properties *prop = &ctx->hdev->asic_prop;
 	u64 host_range_start, host_range_end, host_huge_range_start,
 		host_huge_range_end, dram_range_start, dram_range_end;
+	u32 host_page_size, host_huge_page_size, dram_page_size;
 
 	atomic64_set(&ctx->dram_phys_mem, 0);
 
@@ -1685,27 +1821,23 @@ int hl_vm_ctx_init(struct hl_ctx *ctx)
 	 *   In case of DRAM mapping, the returned address is the physical
 	 *   address of the memory related to the given handle.
 	 */
-	if (ctx->hdev->mmu_enable) {
-		dram_range_start = prop->dmmu.start_addr;
-		dram_range_end = prop->dmmu.end_addr;
-		host_range_start = prop->pmmu.start_addr;
-		host_range_end = prop->pmmu.end_addr;
-		host_huge_range_start = prop->pmmu_huge.start_addr;
-		host_huge_range_end = prop->pmmu_huge.end_addr;
-	} else {
-		dram_range_start = prop->dram_user_base_address;
-		dram_range_end = prop->dram_end_address;
-		host_range_start = prop->dram_user_base_address;
-		host_range_end = prop->dram_end_address;
-		host_huge_range_start = prop->dram_user_base_address;
-		host_huge_range_end = prop->dram_end_address;
-	}
+	if (!ctx->hdev->mmu_enable)
+		return 0;
+
+	dram_range_start = prop->dmmu.start_addr;
+	dram_range_end = prop->dmmu.end_addr;
+	dram_page_size = prop->dmmu.page_size;
+	host_range_start = prop->pmmu.start_addr;
+	host_range_end = prop->pmmu.end_addr;
+	host_page_size = prop->pmmu.page_size;
+	host_huge_range_start = prop->pmmu_huge.start_addr;
+	host_huge_range_end = prop->pmmu_huge.end_addr;
+	host_huge_page_size = prop->pmmu_huge.page_size;
 
 	return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
-					host_huge_range_start,
-					host_huge_range_end,
-					dram_range_start,
-					dram_range_end);
+			host_page_size, host_huge_range_start,
+			host_huge_range_end, host_huge_page_size,
+			dram_range_start, dram_range_end, dram_page_size);
 }
 
 /*
@@ -1737,6 +1869,9 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 	struct hlist_node *tmp_node;
 	int i;
 
+	if (!ctx->hdev->mmu_enable)
+		return;
+
 	hl_debugfs_remove_ctx_mem_hash(hdev, ctx);
 
 	/*
@@ -1771,13 +1906,21 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 		}
 	spin_unlock(&vm->idr_lock);
 
-	va_range_fini(hdev, ctx->dram_va_range);
+	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_DRAM]);
+	va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST]);
+
 	if (hdev->pmmu_huge_range)
-		va_range_fini(hdev, ctx->host_huge_va_range);
-	va_range_fini(hdev, ctx->host_va_range);
+		va_range_fini(hdev, ctx->va_range[HL_VA_RANGE_TYPE_HOST_HUGE]);
 
 	mutex_destroy(&ctx->mem_hash_lock);
 	hl_mmu_ctx_fini(ctx);
+
+	/* In this case we need to clear the global accounting of DRAM usage
+	 * because the user notifies us on allocations. If the user is no more,
+	 * all DRAM is available
+	 */
+	if (!ctx->hdev->asic_prop.dram_supports_virtual_memory)
+		atomic64_set(&ctx->hdev->dram_used_mem, 0);
 }
 
 /*
diff --git a/drivers/misc/habanalabs/common/mmu.c b/drivers/misc/habanalabs/common/mmu.c
index b5058798aeb9..33ae953d3a36 100644
--- a/drivers/misc/habanalabs/common/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu.c
@@ -22,18 +22,25 @@ static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
  * hl_mmu_init() - initialize the MMU module.
  * @hdev: habanalabs device structure.
  *
- * This function does the following:
- * - Create a pool of pages for pgt_infos.
- * - Create a shadow table for pgt
- *
  * Return: 0 for success, non-zero for failure.
  */
 int hl_mmu_init(struct hl_device *hdev)
 {
-	if (hdev->mmu_enable)
-		return hdev->mmu_func.init(hdev);
+	int rc = -EOPNOTSUPP;
 
-	return 0;
+	if (!hdev->mmu_enable)
+		return 0;
+
+	if (hdev->mmu_func[MMU_DR_PGT].init != NULL) {
+		rc = hdev->mmu_func[MMU_DR_PGT].init(hdev);
+		if (rc)
+			return rc;
+	}
+
+	if (hdev->mmu_func[MMU_HR_PGT].init != NULL)
+		rc = hdev->mmu_func[MMU_HR_PGT].init(hdev);
+
+	return rc;
 }
 
 /**
@@ -48,8 +55,14 @@ int hl_mmu_init(struct hl_device *hdev)
  */
 void hl_mmu_fini(struct hl_device *hdev)
 {
-	if (hdev->mmu_enable)
-		hdev->mmu_func.fini(hdev);
+	if (!hdev->mmu_enable)
+		return;
+
+	if (hdev->mmu_func[MMU_DR_PGT].fini != NULL)
+		hdev->mmu_func[MMU_DR_PGT].fini(hdev);
+
+	if (hdev->mmu_func[MMU_HR_PGT].fini != NULL)
+		hdev->mmu_func[MMU_HR_PGT].fini(hdev);
 }
 
 /**
@@ -63,11 +76,23 @@ void hl_mmu_fini(struct hl_device *hdev)
 int hl_mmu_ctx_init(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
+	int rc = -EOPNOTSUPP;
 
-	if (hdev->mmu_enable)
-		return hdev->mmu_func.ctx_init(ctx);
+	if (!hdev->mmu_enable)
+		return 0;
 
-	return 0;
+	mutex_init(&ctx->mmu_lock);
+
+	if (hdev->mmu_func[MMU_DR_PGT].ctx_init != NULL) {
+		rc = hdev->mmu_func[MMU_DR_PGT].ctx_init(ctx);
+		if (rc)
+			return rc;
+	}
+
+	if (hdev->mmu_func[MMU_HR_PGT].ctx_init != NULL)
+		rc = hdev->mmu_func[MMU_HR_PGT].ctx_init(ctx);
+
+	return rc;
 }
 
 /*
@@ -84,12 +109,20 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
 
-	if (hdev->mmu_enable)
-		hdev->mmu_func.ctx_fini(ctx);
+	if (!hdev->mmu_enable)
+		return;
+
+	if (hdev->mmu_func[MMU_DR_PGT].ctx_fini != NULL)
+		hdev->mmu_func[MMU_DR_PGT].ctx_fini(ctx);
+
+	if (hdev->mmu_func[MMU_HR_PGT].ctx_fini != NULL)
+		hdev->mmu_func[MMU_HR_PGT].ctx_fini(ctx);
+
+	mutex_destroy(&ctx->mmu_lock);
 }
 
 /*
- * hl_mmu_unmap - unmaps a virtual addr
+ * hl_mmu_unmap_page - unmaps a virtual addr
  *
  * @ctx: pointer to the context structure
  * @virt_addr: virt addr to map from
@@ -109,7 +142,7 @@ void hl_mmu_ctx_fini(struct hl_ctx *ctx)
  * For optimization reasons PCI flush may be requested once after unmapping of
  * large area.
  */
-int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
+int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 		bool flush_pte)
 {
 	struct hl_device *hdev = ctx->hdev;
@@ -117,7 +150,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	struct hl_mmu_properties *mmu_prop;
 	u64 real_virt_addr;
 	u32 real_page_size, npages;
-	int i, rc = 0;
+	int i, rc = 0, pgt_residency;
 	bool is_dram_addr;
 
 	if (!hdev->mmu_enable)
@@ -132,6 +165,8 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	else
 		mmu_prop = &prop->pmmu;
 
+	pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT;
+
 	/*
 	 * The H/W handles mapping of specific page sizes. Hence if the page
 	 * size is bigger, we break it to sub-pages and unmap them separately.
@@ -150,7 +185,8 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	real_virt_addr = virt_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr);
+		rc = hdev->mmu_func[pgt_residency].unmap(ctx,
+						real_virt_addr, is_dram_addr);
 		if (rc)
 			break;
 
@@ -158,13 +194,13 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	}
 
 	if (flush_pte)
-		hdev->mmu_func.flush(ctx);
+		hdev->mmu_func[pgt_residency].flush(ctx);
 
 	return rc;
 }
 
 /*
- * hl_mmu_map - maps a virtual addr to physical addr
+ * hl_mmu_map_page - maps a virtual addr to physical addr
  *
  * @ctx: pointer to the context structure
  * @virt_addr: virt addr to map from
@@ -185,17 +221,18 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
  * For optimization reasons PCI flush may be requested once after mapping of
  * large area.
  */
-int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
-		bool flush_pte)
+int hl_mmu_map_page(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
+		u32 page_size, bool flush_pte)
 {
 	struct hl_device *hdev = ctx->hdev;
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct hl_mmu_properties *mmu_prop;
 	u64 real_virt_addr, real_phys_addr;
 	u32 real_page_size, npages;
-	int i, rc, mapped_cnt = 0;
+	int i, rc, pgt_residency, mapped_cnt = 0;
 	bool is_dram_addr;
 
+
 	if (!hdev->mmu_enable)
 		return 0;
 
@@ -208,6 +245,8 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
 	else
 		mmu_prop = &prop->pmmu;
 
+	pgt_residency = mmu_prop->host_resident ? MMU_HR_PGT : MMU_DR_PGT;
+
 	/*
 	 * The H/W handles mapping of specific page sizes. Hence if the page
 	 * size is bigger, we break it to sub-pages and map them separately.
@@ -216,7 +255,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
 		real_page_size = mmu_prop->page_size;
 	} else {
 		dev_err(hdev->dev,
-			"page size of %u is not %uKB aligned, can't unmap\n",
+			"page size of %u is not %uKB aligned, can't map\n",
 			page_size, mmu_prop->page_size >> 10);
 
 		return -EFAULT;
@@ -231,8 +270,9 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
 	real_phys_addr = phys_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = hdev->mmu_func.map(ctx, real_virt_addr, real_phys_addr,
-				real_page_size, is_dram_addr);
+		rc = hdev->mmu_func[pgt_residency].map(ctx,
+						real_virt_addr, real_phys_addr,
+						real_page_size, is_dram_addr);
 		if (rc)
 			goto err;
 
@@ -242,21 +282,124 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
 	}
 
 	if (flush_pte)
-		hdev->mmu_func.flush(ctx);
+		hdev->mmu_func[pgt_residency].flush(ctx);
 
 	return 0;
 
 err:
 	real_virt_addr = virt_addr;
 	for (i = 0 ; i < mapped_cnt ; i++) {
-		if (hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr))
+		if (hdev->mmu_func[pgt_residency].unmap(ctx,
+						real_virt_addr, is_dram_addr))
 			dev_warn_ratelimited(hdev->dev,
 				"failed to unmap va: 0x%llx\n", real_virt_addr);
 
 		real_virt_addr += real_page_size;
 	}
 
-	hdev->mmu_func.flush(ctx);
+	hdev->mmu_func[pgt_residency].flush(ctx);
+
+	return rc;
+}
+
+/*
+ * hl_mmu_map_contiguous - implements a wrapper for hl_mmu_map_page
+ *                         for mapping contiguous physical memory
+ *
+ * @ctx: pointer to the context structure
+ * @virt_addr: virt addr to map from
+ * @phys_addr: phys addr to map to
+ * @size: size to map
+ *
+ */
+int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr,
+					u64 phys_addr, u32 size)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 curr_va, curr_pa;
+	u32 page_size;
+	bool flush_pte;
+	int rc = 0, off;
+
+	if (hl_mem_area_inside_range(virt_addr, size,
+			prop->dmmu.start_addr, prop->dmmu.end_addr))
+		page_size = prop->dmmu.page_size;
+	else if (hl_mem_area_inside_range(virt_addr, size,
+			prop->pmmu.start_addr, prop->pmmu.end_addr))
+		page_size = prop->pmmu.page_size;
+	else if (hl_mem_area_inside_range(virt_addr, size,
+			prop->pmmu_huge.start_addr, prop->pmmu_huge.end_addr))
+		page_size = prop->pmmu_huge.page_size;
+	else
+		return -EINVAL;
+
+	for (off = 0 ; off < size ; off += page_size) {
+		curr_va = virt_addr + off;
+		curr_pa = phys_addr + off;
+		flush_pte = (off + page_size) >= size;
+		rc = hl_mmu_map_page(ctx, curr_va, curr_pa, page_size,
+								flush_pte);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Map failed for va 0x%llx to pa 0x%llx\n",
+				curr_va, curr_pa);
+			goto unmap;
+		}
+	}
+
+	return rc;
+
+unmap:
+	for (; off >= 0 ; off -= page_size) {
+		curr_va = virt_addr + off;
+		flush_pte = (off - (s32) page_size) < 0;
+		if (hl_mmu_unmap_page(ctx, curr_va, page_size, flush_pte))
+			dev_warn_ratelimited(hdev->dev,
+				"failed to unmap va 0x%llx\n", curr_va);
+	}
+
+	return rc;
+}
+
+/*
+ * hl_mmu_unmap_contiguous - implements a wrapper for hl_mmu_unmap_page
+ *                           for unmapping contiguous physical memory
+ *
+ * @ctx: pointer to the context structure
+ * @virt_addr: virt addr to unmap
+ * @size: size to unmap
+ *
+ */
+int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 curr_va;
+	u32 page_size;
+	bool flush_pte;
+	int rc = 0, off;
+
+	if (hl_mem_area_inside_range(virt_addr, size,
+			prop->dmmu.start_addr, prop->dmmu.end_addr))
+		page_size = prop->dmmu.page_size;
+	else if (hl_mem_area_inside_range(virt_addr, size,
+			prop->pmmu.start_addr, prop->pmmu.end_addr))
+		page_size = prop->pmmu.page_size;
+	else if (hl_mem_area_inside_range(virt_addr, size,
+			prop->pmmu_huge.start_addr, prop->pmmu_huge.end_addr))
+		page_size = prop->pmmu_huge.page_size;
+	else
+		return -EINVAL;
+
+	for (off = 0 ; off < size ; off += page_size) {
+		curr_va = virt_addr + off;
+		flush_pte = (off + page_size) >= size;
+		rc = hl_mmu_unmap_page(ctx, curr_va, page_size, flush_pte);
+		if (rc)
+			dev_warn_ratelimited(hdev->dev,
+				"Unmap failed for va 0x%llx\n", curr_va);
+	}
 
 	return rc;
 }
@@ -271,8 +414,14 @@ void hl_mmu_swap_out(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
 
-	if (hdev->mmu_enable)
-		hdev->mmu_func.swap_out(ctx);
+	if (!hdev->mmu_enable)
+		return;
+
+	if (hdev->mmu_func[MMU_DR_PGT].swap_out != NULL)
+		hdev->mmu_func[MMU_DR_PGT].swap_out(ctx);
+
+	if (hdev->mmu_func[MMU_HR_PGT].swap_out != NULL)
+		hdev->mmu_func[MMU_HR_PGT].swap_out(ctx);
 }
 
 /*
@@ -285,8 +434,64 @@ void hl_mmu_swap_in(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
 
-	if (hdev->mmu_enable)
-		hdev->mmu_func.swap_in(ctx);
+	if (!hdev->mmu_enable)
+		return;
+
+	if (hdev->mmu_func[MMU_DR_PGT].swap_in != NULL)
+		hdev->mmu_func[MMU_DR_PGT].swap_in(ctx);
+
+	if (hdev->mmu_func[MMU_HR_PGT].swap_in != NULL)
+		hdev->mmu_func[MMU_HR_PGT].swap_in(ctx);
+}
+
+int hl_mmu_va_to_pa(struct hl_ctx *ctx, u64 virt_addr, u64 *phys_addr)
+{
+	struct hl_mmu_hop_info hops;
+	u64 tmp_addr;
+	int rc;
+
+	rc = hl_mmu_get_tlb_info(ctx, virt_addr, &hops);
+	if (rc)
+		return rc;
+
+	/* last hop holds the phys address and flags */
+	tmp_addr = hops.hop_info[hops.used_hops - 1].hop_pte_val;
+	*phys_addr = (tmp_addr & HOP_PHYS_ADDR_MASK) | (virt_addr & FLAGS_MASK);
+
+	return 0;
+}
+
+int hl_mmu_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
+			struct hl_mmu_hop_info *hops)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
+	int rc;
+	bool is_dram_addr;
+
+	if (!hdev->mmu_enable)
+		return -EOPNOTSUPP;
+
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+						prop->dmmu.start_addr,
+						prop->dmmu.end_addr);
+
+	/* host-residency is the same in PMMU and HPMMU, use one of them */
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
+	mutex_lock(&ctx->mmu_lock);
+
+	if (mmu_prop->host_resident)
+		rc = hdev->mmu_func[MMU_HR_PGT].get_tlb_info(ctx,
+							virt_addr, hops);
+	else
+		rc = hdev->mmu_func[MMU_DR_PGT].get_tlb_info(ctx,
+							virt_addr, hops);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	return rc;
 }
 
 int hl_mmu_if_set_funcs(struct hl_device *hdev)
@@ -297,7 +502,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
 	switch (hdev->asic_type) {
 	case ASIC_GOYA:
 	case ASIC_GAUDI:
-		hl_mmu_v1_set_funcs(hdev);
+		hl_mmu_v1_set_funcs(hdev, &hdev->mmu_func[MMU_DR_PGT]);
 		break;
 	default:
 		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
diff --git a/drivers/misc/habanalabs/common/mmu_v1.c b/drivers/misc/habanalabs/common/mmu_v1.c
index 8d1eb5265419..2ce6ea89d4fa 100644
--- a/drivers/misc/habanalabs/common/mmu_v1.c
+++ b/drivers/misc/habanalabs/common/mmu_v1.c
@@ -8,7 +8,6 @@
 #include "habanalabs.h"
 #include "../include/hw_ip/mmu/mmu_general.h"
 
-#include <linux/genalloc.h>
 #include <linux/slab.h>
 
 static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
@@ -29,7 +28,7 @@ static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
 {
 	struct hl_device *hdev = ctx->hdev;
 
-	gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, pgt_info->phys_addr,
+	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, pgt_info->phys_addr,
 			hdev->asic_prop.mmu_hop_table_size);
 	hash_del(&pgt_info->node);
 	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
@@ -54,7 +53,7 @@ static u64 alloc_hop(struct hl_ctx *ctx)
 	if (!pgt_info)
 		return ULLONG_MAX;
 
-	phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.mmu_pgt_pool,
+	phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.dr.mmu_pgt_pool,
 					prop->mmu_hop_table_size);
 	if (!phys_addr) {
 		dev_err(hdev->dev, "failed to allocate page\n");
@@ -75,7 +74,7 @@ static u64 alloc_hop(struct hl_ctx *ctx)
 	return shadow_addr;
 
 shadow_err:
-	gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, phys_addr,
+	gen_pool_free(hdev->mmu_priv.dr.mmu_pgt_pool, phys_addr,
 			prop->mmu_hop_table_size);
 pool_add_err:
 	kfree(pgt_info);
@@ -91,7 +90,7 @@ static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
 
 static inline u64 get_hop0_addr(struct hl_ctx *ctx)
 {
-	return (u64) (uintptr_t) ctx->hdev->mmu_priv.mmu_shadow_hop0 +
+	return (u64) (uintptr_t) ctx->hdev->mmu_priv.dr.mmu_shadow_hop0 +
 			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
 }
 
@@ -263,7 +262,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 		hop2_pte_addr, hop3_pte_addr, pte_val;
 	int rc, i, j, hop3_allocated = 0;
 
-	if ((!hdev->dram_supports_virtual_memory) ||
+	if ((!prop->dram_supports_virtual_memory) ||
 			(!hdev->dram_default_page_mapping) ||
 			(ctx->asid == HL_KERNEL_ASID_ID))
 		return 0;
@@ -363,7 +362,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
 		hop2_pte_addr, hop3_pte_addr;
 	int i, j;
 
-	if ((!hdev->dram_supports_virtual_memory) ||
+	if ((!prop->dram_supports_virtual_memory) ||
 			(!hdev->dram_default_page_mapping) ||
 			(ctx->asid == HL_KERNEL_ASID_ID))
 		return;
@@ -419,15 +418,15 @@ static int hl_mmu_v1_init(struct hl_device *hdev)
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int rc;
 
-	hdev->mmu_priv.mmu_pgt_pool =
+	hdev->mmu_priv.dr.mmu_pgt_pool =
 			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
 
-	if (!hdev->mmu_priv.mmu_pgt_pool) {
+	if (!hdev->mmu_priv.dr.mmu_pgt_pool) {
 		dev_err(hdev->dev, "Failed to create page gen pool\n");
 		return -ENOMEM;
 	}
 
-	rc = gen_pool_add(hdev->mmu_priv.mmu_pgt_pool, prop->mmu_pgt_addr +
+	rc = gen_pool_add(hdev->mmu_priv.dr.mmu_pgt_pool, prop->mmu_pgt_addr +
 			prop->mmu_hop0_tables_total_size,
 			prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
 			-1);
@@ -436,10 +435,10 @@ static int hl_mmu_v1_init(struct hl_device *hdev)
 		goto err_pool_add;
 	}
 
-	hdev->mmu_priv.mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
+	hdev->mmu_priv.dr.mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
 						prop->mmu_hop_table_size,
 						GFP_KERNEL | __GFP_ZERO);
-	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.mmu_shadow_hop0)) {
+	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.dr.mmu_shadow_hop0)) {
 		rc = -ENOMEM;
 		goto err_pool_add;
 	}
@@ -449,7 +448,7 @@ static int hl_mmu_v1_init(struct hl_device *hdev)
 	return 0;
 
 err_pool_add:
-	gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
+	gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
 
 	return rc;
 }
@@ -468,8 +467,8 @@ static void hl_mmu_v1_fini(struct hl_device *hdev)
 {
 	/* MMU H/W fini was already done in device hw_fini() */
 
-	kvfree(hdev->mmu_priv.mmu_shadow_hop0);
-	gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
+	kvfree(hdev->mmu_priv.dr.mmu_shadow_hop0);
+	gen_pool_destroy(hdev->mmu_priv.dr.mmu_pgt_pool);
 }
 
 /**
@@ -482,9 +481,7 @@ static void hl_mmu_v1_fini(struct hl_device *hdev)
  */
 static int hl_mmu_v1_ctx_init(struct hl_ctx *ctx)
 {
-	mutex_init(&ctx->mmu_lock);
 	hash_init(ctx->mmu_shadow_hash);
-
 	return dram_default_mapping_init(ctx);
 }
 
@@ -517,8 +514,6 @@ static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx)
 			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
 		_free_hop(ctx, pgt_info);
 	}
-
-	mutex_destroy(&ctx->mmu_lock);
 }
 
 static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
@@ -842,15 +837,114 @@ static void hl_mmu_v1_swap_in(struct hl_ctx *ctx)
 
 }
 
+static inline u64 get_hop_pte_addr(struct hl_ctx *ctx,
+				struct hl_mmu_properties *mmu_prop,
+				int hop_num, u64 hop_addr, u64 virt_addr)
+{
+	switch (hop_num) {
+	case 0:
+		return get_hop0_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
+	case 1:
+		return get_hop1_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
+	case 2:
+		return get_hop2_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
+	case 3:
+		return get_hop3_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
+	case 4:
+		return get_hop4_pte_addr(ctx, mmu_prop, hop_addr, virt_addr);
+	default:
+		break;
+	}
+	return U64_MAX;
+}
+
+static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
+				struct hl_mmu_hop_info *hops)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
+	bool is_dram_addr, is_pmmu_addr, is_pmmu_h_addr, is_huge;
+	int i, used_hops;
+
+	is_dram_addr = hl_mem_area_inside_range(virt_addr, prop->dmmu.page_size,
+						prop->dmmu.start_addr,
+						prop->dmmu.end_addr);
+	is_pmmu_addr = hl_mem_area_inside_range(virt_addr, prop->pmmu.page_size,
+						prop->pmmu.start_addr,
+						prop->pmmu.end_addr);
+	is_pmmu_h_addr = hl_mem_area_inside_range(virt_addr,
+						prop->pmmu_huge.page_size,
+						prop->pmmu_huge.start_addr,
+						prop->pmmu_huge.end_addr);
+	if (is_dram_addr) {
+		mmu_prop = &prop->dmmu;
+		is_huge = true;
+	} else if (is_pmmu_addr) {
+		mmu_prop = &prop->pmmu;
+		is_huge = false;
+	} else if (is_pmmu_h_addr) {
+		mmu_prop = &prop->pmmu_huge;
+		is_huge = true;
+	} else {
+		return -EINVAL;
+	}
+
+	used_hops = mmu_prop->num_hops;
+
+	/* huge pages use lesser hops */
+	if (is_huge)
+		used_hops--;
+
+	hops->hop_info[0].hop_addr = get_phys_hop0_addr(ctx);
+	hops->hop_info[0].hop_pte_addr =
+			get_hop_pte_addr(ctx, mmu_prop, 0,
+					hops->hop_info[0].hop_addr, virt_addr);
+	hops->hop_info[0].hop_pte_val =
+			hdev->asic_funcs->read_pte(hdev,
+						hops->hop_info[0].hop_pte_addr);
+
+	for (i = 1 ; i < used_hops ; i++) {
+		hops->hop_info[i].hop_addr =
+			get_next_hop_addr(ctx,
+					hops->hop_info[i - 1].hop_pte_val);
+		if (hops->hop_info[i].hop_addr == ULLONG_MAX)
+			return -EFAULT;
+
+		hops->hop_info[i].hop_pte_addr =
+				get_hop_pte_addr(ctx, mmu_prop, i,
+						hops->hop_info[i].hop_addr,
+						virt_addr);
+		hops->hop_info[i].hop_pte_val =
+				hdev->asic_funcs->read_pte(hdev,
+						hops->hop_info[i].hop_pte_addr);
+
+		if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK))
+			return -EFAULT;
+
+		if (hops->hop_info[i].hop_pte_val & LAST_MASK)
+			break;
+	}
+
+	/* if passed over all hops then no last hop was found */
+	if (i == mmu_prop->num_hops)
+		return -EFAULT;
+
+	if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK))
+		return -EFAULT;
+
+	hops->used_hops = i + 1;
+
+	return 0;
+}
+
 /*
  * hl_mmu_v1_prepare - prepare mmu  for working with mmu v1
  *
  * @hdev: pointer to the device structure
  */
-void hl_mmu_v1_set_funcs(struct hl_device *hdev)
+void hl_mmu_v1_set_funcs(struct hl_device *hdev, struct hl_mmu_funcs *mmu)
 {
-	struct hl_mmu_funcs *mmu = &hdev->mmu_func;
-
 	mmu->init = hl_mmu_v1_init;
 	mmu->fini = hl_mmu_v1_fini;
 	mmu->ctx_init = hl_mmu_v1_ctx_init;
@@ -860,4 +954,5 @@ void hl_mmu_v1_set_funcs(struct hl_device *hdev)
 	mmu->flush = flush;
 	mmu->swap_out = hl_mmu_v1_swap_out;
 	mmu->swap_in = hl_mmu_v1_swap_in;
+	mmu->get_tlb_info = hl_mmu_v1_get_tlb_info;
 }
diff --git a/drivers/misc/habanalabs/common/pci.c b/drivers/misc/habanalabs/common/pci.c
index 4327e5704ebb..923b2606e29f 100644
--- a/drivers/misc/habanalabs/common/pci.c
+++ b/drivers/misc/habanalabs/common/pci.c
@@ -338,17 +338,12 @@ static int hl_pci_set_dma_mask(struct hl_device *hdev)
 /**
  * hl_pci_init() - PCI initialization code.
  * @hdev: Pointer to hl_device structure.
- * @cpu_boot_status_reg: status register of the device's CPU
- * @boot_err0_reg: boot error register of the device's CPU
- * @preboot_ver_timeout: how much to wait before bailing out on reading
- *                       the preboot version
  *
  * Set DMA masks, initialize the PCI controller and map the PCI BARs.
  *
  * Return: 0 on success, non-zero for failure.
  */
-int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
-		u32 boot_err0_reg, u32 preboot_ver_timeout)
+int hl_pci_init(struct hl_device *hdev)
 {
 	struct pci_dev *pdev = hdev->pdev;
 	int rc;
@@ -380,15 +375,6 @@ int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
 	if (rc)
 		goto unmap_pci_bars;
 
-	/* Before continuing in the initialization, we need to read the preboot
-	 * version to determine whether we run with a security-enabled firmware
-	 * The check will be done in each ASIC's specific code
-	 */
-	rc = hl_fw_read_preboot_ver(hdev, cpu_boot_status_reg, boot_err0_reg,
-					preboot_ver_timeout);
-	if (rc)
-		goto unmap_pci_bars;
-
 	return 0;
 
 unmap_pci_bars:
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index 3ceae87016b1..4366d8f93842 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -12,7 +12,7 @@
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 {
 	struct cpucp_packet pkt;
-	long result;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -32,10 +32,10 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 		dev_err(hdev->dev,
 			"Failed to get frequency of PLL %d, error %d\n",
 			pll_index, rc);
-		result = rc;
+		return rc;
 	}
 
-	return result;
+	return (long) result;
 }
 
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
@@ -62,7 +62,7 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
 u64 hl_get_max_power(struct hl_device *hdev)
 {
 	struct cpucp_packet pkt;
-	long result;
+	u64 result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
@@ -75,7 +75,7 @@ u64 hl_get_max_power(struct hl_device *hdev)
 
 	if (rc) {
 		dev_err(hdev->dev, "Failed to get max power, error %d\n", rc);
-		result = rc;
+		return (u64) rc;
 	}
 
 	return result;
@@ -276,6 +276,8 @@ static ssize_t status_show(struct device *dev, struct device_attribute *attr,
 		str = "In reset";
 	else if (hdev->disabled)
 		str = "Malfunction";
+	else if (hdev->needs_reset)
+		str = "Needs Reset";
 	else
 		str = "Operational";
 
@@ -304,7 +306,7 @@ static ssize_t max_power_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 	long val;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	val = hl_get_max_power(hdev);
@@ -319,7 +321,7 @@ static ssize_t max_power_store(struct device *dev,
 	unsigned long value;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev)) {
+	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
 		goto out;
 	}
@@ -347,7 +349,7 @@ static ssize_t eeprom_read_handler(struct file *filp, struct kobject *kobj,
 	char *data;
 	int rc;
 
-	if (hl_device_disabled_or_in_reset(hdev))
+	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	if (!max_size)