Merge remote-tracking branch 'origin/master' into perf/core

Further perf/core patches will depend on: d3f7b1bb2040 ("mm/gup: fix gup_fast with dynamic page table folding") which is already in Linus' tree.
author: Peter Zijlstra <peterz@infradead.org> 2020-11-26 13:16:55 +0100
committer: Peter Zijlstra <peterz@infradead.org> 2020-11-26 13:16:55 +0100
commit: 20c7775aecea04d8ca322039969d49dcf568e0e9 (patch)
tree: 138c057839197c9021043353e994815c0250e669 /drivers/misc/habanalabs/common
parent: perf/x86/intel: Add event constraint for CYCLE_ACTIVITY.STALLS_MEM_ANY (diff)
parent: Merge tag 'media/v5.10-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-media (diff)
download: linux-dev-20c7775aecea04d8ca322039969d49dcf568e0e9.tar.xz
linux-dev-20c7775aecea04d8ca322039969d49dcf568e0e9.zip
18 files changed, 2024 insertions, 1137 deletions
diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile
index b984bfa4face..eccd8c7dc62d 100644
--- a/drivers/misc/habanalabs/common/Makefile
+++ b/drivers/misc/habanalabs/common/Makefile
@@ -3,5 +3,5 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
 		common/asid.o common/habanalabs_ioctl.o \
 		common/command_buffer.o common/hw_queue.o common/irq.o \
 		common/sysfs.o common/hwmon.o common/memory.o \
-		common/command_submission.o common/mmu.o common/firmware_if.o \
-		common/pci.o
+		common/command_submission.o common/mmu.o common/mmu_v1.o \
+		common/firmware_if.o common/pci.o
diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 7c38c4f7f9c0..ada570f35a41 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -10,17 +10,142 @@
 
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 #include <linux/genalloc.h>
 
+static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_vm_va_block *va_block, *tmp;
+	dma_addr_t bus_addr;
+	u64 virt_addr;
+	u32 page_size = prop->pmmu.page_size;
+	s32 offset;
+	int rc;
+
+	if (!hdev->supports_cb_mapping) {
+		dev_err_ratelimited(hdev->dev,
+				"Cannot map CB because no VA range is allocated for CB mapping\n");
+		return -EINVAL;
+	}
+
+	if (!hdev->mmu_enable) {
+		dev_err_ratelimited(hdev->dev,
+				"Cannot map CB because MMU is disabled\n");
+		return -EINVAL;
+	}
+
+	INIT_LIST_HEAD(&cb->va_block_list);
+
+	for (bus_addr = cb->bus_address;
+			bus_addr < cb->bus_address + cb->size;
+			bus_addr += page_size) {
+
+		virt_addr = (u64) gen_pool_alloc(ctx->cb_va_pool, page_size);
+		if (!virt_addr) {
+			dev_err(hdev->dev,
+				"Failed to allocate device virtual address for CB\n");
+			rc = -ENOMEM;
+			goto err_va_pool_free;
+		}
+
+		va_block = kzalloc(sizeof(*va_block), GFP_KERNEL);
+		if (!va_block) {
+			rc = -ENOMEM;
+			gen_pool_free(ctx->cb_va_pool, virt_addr, page_size);
+			goto err_va_pool_free;
+		}
+
+		va_block->start = virt_addr;
+		va_block->end = virt_addr + page_size;
+		va_block->size = page_size;
+		list_add_tail(&va_block->node, &cb->va_block_list);
+	}
+
+	mutex_lock(&ctx->mmu_lock);
+
+	bus_addr = cb->bus_address;
+	offset = 0;
+	list_for_each_entry(va_block, &cb->va_block_list, node) {
+		rc = hl_mmu_map(ctx, va_block->start, bus_addr, va_block->size,
+				list_is_last(&va_block->node,
+						&cb->va_block_list));
+		if (rc) {
+			dev_err(hdev->dev, "Failed to map VA %#llx to CB\n",
+				va_block->start);
+			goto err_va_umap;
+		}
+
+		bus_addr += va_block->size;
+		offset += va_block->size;
+	}
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	cb->is_mmu_mapped = true;
+
+	return 0;
+
+err_va_umap:
+	list_for_each_entry(va_block, &cb->va_block_list, node) {
+		if (offset <= 0)
+			break;
+		hl_mmu_unmap(ctx, va_block->start, va_block->size,
+				offset <= va_block->size);
+		offset -= va_block->size;
+	}
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+err_va_pool_free:
+	list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
+		gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
+		list_del(&va_block->node);
+		kfree(va_block);
+	}
+
+	return rc;
+}
+
+static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct hl_vm_va_block *va_block, *tmp;
+
+	mutex_lock(&ctx->mmu_lock);
+
+	list_for_each_entry(va_block, &cb->va_block_list, node)
+		if (hl_mmu_unmap(ctx, va_block->start, va_block->size,
+				list_is_last(&va_block->node,
+						&cb->va_block_list)))
+			dev_warn_ratelimited(hdev->dev,
+					"Failed to unmap CB's va 0x%llx\n",
+					va_block->start);
+
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+
+	mutex_unlock(&ctx->mmu_lock);
+
+	list_for_each_entry_safe(va_block, tmp, &cb->va_block_list, node) {
+		gen_pool_free(ctx->cb_va_pool, va_block->start, va_block->size);
+		list_del(&va_block->node);
+		kfree(va_block);
+	}
+}
+
 static void cb_fini(struct hl_device *hdev, struct hl_cb *cb)
 {
 	if (cb->is_internal)
 		gen_pool_free(hdev->internal_cb_pool,
-				cb->kernel_address, cb->size);
+				(uintptr_t)cb->kernel_address, cb->size);
 	else
 		hdev->asic_funcs->asic_dma_free_coherent(hdev, cb->size,
-				(void *) (uintptr_t) cb->kernel_address,
-				cb->bus_address);
+				cb->kernel_address, cb->bus_address);
 
 	kfree(cb);
 }
@@ -46,6 +171,11 @@ static void cb_release(struct kref *ref)
 
 	hl_debugfs_remove_cb(cb);
 
+	if (cb->is_mmu_mapped)
+		cb_unmap_mem(cb->ctx, cb);
+
+	hl_ctx_put(cb->ctx);
+
 	cb_do_release(hdev, cb);
 }
 
@@ -99,18 +229,19 @@ static struct hl_cb *hl_cb_alloc(struct hl_device *hdev, u32 cb_size,
 		return NULL;
 	}
 
-	cb->kernel_address = (u64) (uintptr_t) p;
+	cb->kernel_address = p;
 	cb->size = cb_size;
 
 	return cb;
 }
 
 int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
-			u32 cb_size, u64 *handle, int ctx_id, bool internal_cb)
+			struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
+			bool map_cb, u64 *handle)
 {
 	struct hl_cb *cb;
 	bool alloc_new_cb = true;
-	int rc;
+	int rc, ctx_id = ctx->asid;
 
 	/*
 	 * Can't use generic function to check this because of special case
@@ -162,7 +293,21 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 	}
 
 	cb->hdev = hdev;
-	cb->ctx_id = ctx_id;
+	cb->ctx = ctx;
+	hl_ctx_get(hdev, cb->ctx);
+
+	if (map_cb) {
+		if (ctx_id == HL_KERNEL_ASID_ID) {
+			dev_err(hdev->dev,
+				"CB mapping is not supported for kernel context\n");
+			rc = -EINVAL;
+			goto release_cb;
+		}
+
+		rc = cb_map_mem(ctx, cb);
+		if (rc)
+			goto release_cb;
+	}
 
 	spin_lock(&mgr->cb_lock);
 	rc = idr_alloc(&mgr->cb_handles, cb, 1, 0, GFP_ATOMIC);
@@ -170,10 +315,10 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 
 	if (rc < 0) {
 		dev_err(hdev->dev, "Failed to allocate IDR for a new CB\n");
-		goto release_cb;
+		goto unmap_mem;
 	}
 
-	cb->id = rc;
+	cb->id = (u64) rc;
 
 	kref_init(&cb->refcount);
 	spin_lock_init(&cb->lock);
@@ -182,14 +327,18 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 	 * idr is 32-bit so we can safely OR it with a mask that is above
 	 * 32 bit
 	 */
-	*handle = cb->id | HL_MMAP_CB_MASK;
+	*handle = cb->id | HL_MMAP_TYPE_CB;
 	*handle <<= PAGE_SHIFT;
 
 	hl_debugfs_add_cb(cb);
 
 	return 0;
 
+unmap_mem:
+	if (cb->is_mmu_mapped)
+		cb_unmap_mem(cb->ctx, cb);
 release_cb:
+	hl_ctx_put(cb->ctx);
 	cb_do_release(hdev, cb);
 out_err:
 	*handle = 0;
@@ -249,9 +398,10 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 				args->in.cb_size, HL_MAX_CB_SIZE);
 			rc = -EINVAL;
 		} else {
-			rc = hl_cb_create(hdev, &hpriv->cb_mgr,
-					args->in.cb_size, &handle,
-					hpriv->ctx->asid, false);
+			rc = hl_cb_create(hdev, &hpriv->cb_mgr, hpriv->ctx,
+					args->in.cb_size, false,
+					!!(args->in.flags & HL_CB_FLAGS_MAP),
+					&handle);
 		}
 
 		memset(args, 0, sizeof(*args));
@@ -299,11 +449,14 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 {
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_cb *cb;
-	phys_addr_t address;
-	u32 handle;
+	u32 handle, user_cb_size;
 	int rc;
 
+	/* We use the page offset to hold the idr and thus we need to clear
+	 * it before doing the mmap itself
+	 */
 	handle = vma->vm_pgoff;
+	vma->vm_pgoff = 0;
 
 	/* reference was taken here */
 	cb = hl_cb_get(hdev, &hpriv->cb_mgr, handle);
@@ -314,7 +467,8 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 	}
 
 	/* Validation check */
-	if ((vma->vm_end - vma->vm_start) != ALIGN(cb->size, PAGE_SIZE)) {
+	user_cb_size = vma->vm_end - vma->vm_start;
+	if (user_cb_size != ALIGN(cb->size, PAGE_SIZE)) {
 		dev_err(hdev->dev,
 			"CB mmap failed, mmap size 0x%lx != 0x%x cb size\n",
 			vma->vm_end - vma->vm_start, cb->size);
@@ -322,6 +476,16 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 		goto put_cb;
 	}
 
+	if (!access_ok((void __user *) (uintptr_t) vma->vm_start,
+							user_cb_size)) {
+		dev_err(hdev->dev,
+			"user pointer is invalid - 0x%lx\n",
+			vma->vm_start);
+
+		rc = -EINVAL;
+		goto put_cb;
+	}
+
 	spin_lock(&cb->lock);
 
 	if (cb->mmap) {
@@ -344,12 +508,8 @@ int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma)
 
 	vma->vm_private_data = cb;
 
-	/* Calculate address for CB */
-	address = virt_to_phys((void *) (uintptr_t) cb->kernel_address);
-
 	rc = hdev->asic_funcs->cb_mmap(hdev, vma, cb->kernel_address,
-					address, cb->size);
-
+					cb->bus_address, cb->size);
 	if (rc) {
 		spin_lock(&cb->lock);
 		cb->mmap = false;
@@ -413,7 +573,7 @@ void hl_cb_mgr_fini(struct hl_device *hdev, struct hl_cb_mgr *mgr)
 		if (kref_put(&cb->refcount, cb_release) != 1)
 			dev_err(hdev->dev,
 				"CB %d for CTX ID %d is still alive\n",
-				id, cb->ctx_id);
+				id, cb->ctx->asid);
 	}
 
 	idr_destroy(&mgr->cb_handles);
@@ -426,8 +586,8 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
 	struct hl_cb *cb;
 	int rc;
 
-	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, cb_size, &cb_handle,
-			HL_KERNEL_ASID_ID, internal_cb);
+	rc = hl_cb_create(hdev, &hdev->kernel_cb_mgr, hdev->kernel_ctx, cb_size,
+				internal_cb, false, &cb_handle);
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to allocate CB for the kernel driver %d\n", rc);
@@ -483,3 +643,45 @@ int hl_cb_pool_fini(struct hl_device *hdev)
 
 	return 0;
 }
+
+int hl_cb_va_pool_init(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int rc;
+
+	if (!hdev->supports_cb_mapping)
+		return 0;
+
+	ctx->cb_va_pool = gen_pool_create(__ffs(prop->pmmu.page_size), -1);
+	if (!ctx->cb_va_pool) {
+		dev_err(hdev->dev,
+			"Failed to create VA gen pool for CB mapping\n");
+		return -ENOMEM;
+	}
+
+	rc = gen_pool_add(ctx->cb_va_pool, prop->cb_va_start_addr,
+			prop->cb_va_end_addr - prop->cb_va_start_addr, -1);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to add memory to VA gen pool for CB mapping\n");
+		goto err_pool_destroy;
+	}
+
+	return 0;
+
+err_pool_destroy:
+	gen_pool_destroy(ctx->cb_va_pool);
+
+	return rc;
+}
+
+void hl_cb_va_pool_fini(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+
+	if (!hdev->supports_cb_mapping)
+		return;
+
+	gen_pool_destroy(ctx->cb_va_pool);
+}
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index b9840e368eb5..b2b974ecc431 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -38,26 +38,10 @@ void hl_sob_reset_error(struct kref *ref)
 			hw_sob->q_idx, hw_sob->sob_id);
 }
 
-static const char *hl_fence_get_driver_name(struct dma_fence *fence)
-{
-	return "HabanaLabs";
-}
-
-static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
-{
-	struct hl_cs_compl *hl_cs_compl =
-		container_of(fence, struct hl_cs_compl, base_fence);
-
-	return dev_name(hl_cs_compl->hdev->dev);
-}
-
-static bool hl_fence_enable_signaling(struct dma_fence *fence)
-{
-	return true;
-}
-
-static void hl_fence_release(struct dma_fence *fence)
+static void hl_fence_release(struct kref *kref)
 {
+	struct hl_fence *fence =
+		container_of(kref, struct hl_fence, refcount);
 	struct hl_cs_compl *hl_cs_cmpl =
 		container_of(fence, struct hl_cs_compl, base_fence);
 	struct hl_device *hdev = hl_cs_cmpl->hdev;
@@ -99,15 +83,27 @@ static void hl_fence_release(struct dma_fence *fence)
 	}
 
 free:
-	kfree_rcu(hl_cs_cmpl, base_fence.rcu);
+	kfree(hl_cs_cmpl);
+}
+
+void hl_fence_put(struct hl_fence *fence)
+{
+	if (fence)
+		kref_put(&fence->refcount, hl_fence_release);
+}
+
+void hl_fence_get(struct hl_fence *fence)
+{
+	if (fence)
+		kref_get(&fence->refcount);
 }
 
-static const struct dma_fence_ops hl_fence_ops = {
-	.get_driver_name = hl_fence_get_driver_name,
-	.get_timeline_name = hl_fence_get_timeline_name,
-	.enable_signaling = hl_fence_enable_signaling,
-	.release = hl_fence_release
-};
+static void hl_fence_init(struct hl_fence *fence)
+{
+	kref_init(&fence->refcount);
+	fence->error = 0;
+	init_completion(&fence->completion);
+}
 
 static void cs_get(struct hl_cs *cs)
 {
@@ -256,6 +252,8 @@ static void cs_counters_aggregate(struct hl_device *hdev, struct hl_ctx *ctx)
 			ctx->cs_counters.parsing_drop_cnt;
 	hdev->aggregated_cs_counters.queue_full_drop_cnt +=
 			ctx->cs_counters.queue_full_drop_cnt;
+	hdev->aggregated_cs_counters.max_cs_in_flight_drop_cnt +=
+			ctx->cs_counters.max_cs_in_flight_drop_cnt;
 }
 
 static void cs_do_release(struct kref *ref)
@@ -336,7 +334,7 @@ static void cs_do_release(struct kref *ref)
 		 * In case the wait for signal CS was submitted, the put occurs
 		 * in init_signal_wait_cs() right before hanging on the PQ.
 		 */
-		dma_fence_put(cs->signal_fence);
+		hl_fence_put(cs->signal_fence);
 	}
 
 	/*
@@ -348,19 +346,18 @@ static void cs_do_release(struct kref *ref)
 	hl_ctx_put(cs->ctx);
 
 	/* We need to mark an error for not submitted because in that case
-	 * the dma fence release flow is different. Mainly, we don't need
+	 * the hl fence release flow is different. Mainly, we don't need
 	 * to handle hw_sob for signal/wait
 	 */
 	if (cs->timedout)
-		dma_fence_set_error(cs->fence, -ETIMEDOUT);
+		cs->fence->error = -ETIMEDOUT;
 	else if (cs->aborted)
-		dma_fence_set_error(cs->fence, -EIO);
+		cs->fence->error = -EIO;
 	else if (!cs->submitted)
-		dma_fence_set_error(cs->fence, -EBUSY);
-
-	dma_fence_signal(cs->fence);
-	dma_fence_put(cs->fence);
+		cs->fence->error = -EBUSY;
 
+	complete_all(&cs->fence->completion);
+	hl_fence_put(cs->fence);
 	cs_counters_aggregate(hdev, cs->ctx);
 
 	kfree(cs->jobs_in_queue_cnt);
@@ -401,7 +398,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 			enum hl_cs_type cs_type, struct hl_cs **cs_new)
 {
 	struct hl_cs_compl *cs_cmpl;
-	struct dma_fence *other = NULL;
+	struct hl_fence *other = NULL;
 	struct hl_cs *cs;
 	int rc;
 
@@ -434,9 +431,11 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs_cmpl->cs_seq = ctx->cs_sequence;
 	other = ctx->cs_pending[cs_cmpl->cs_seq &
 				(hdev->asic_prop.max_pending_cs - 1)];
-	if ((other) && (!dma_fence_is_signaled(other))) {
-		dev_dbg(hdev->dev,
+
+	if (other && !completion_done(&other->completion)) {
+		dev_dbg_ratelimited(hdev->dev,
 			"Rejecting CS because of too many in-flights CS\n");
+		ctx->cs_counters.max_cs_in_flight_drop_cnt++;
 		rc = -EAGAIN;
 		goto free_fence;
 	}
@@ -448,8 +447,8 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 		goto free_fence;
 	}
 
-	dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
-			ctx->asid, ctx->cs_sequence);
+	/* init hl_fence */
+	hl_fence_init(&cs_cmpl->base_fence);
 
 	cs->sequence = cs_cmpl->cs_seq;
 
@@ -458,9 +457,9 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 							&cs_cmpl->base_fence;
 	ctx->cs_sequence++;
 
-	dma_fence_get(&cs_cmpl->base_fence);
+	hl_fence_get(&cs_cmpl->base_fence);
 
-	dma_fence_put(other);
+	hl_fence_put(other);
 
 	spin_unlock(&ctx->cs_lock);
 
@@ -690,8 +689,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 			rc = -ENOMEM;
 			if (is_kernel_allocated_cb)
 				goto release_cb;
-			else
-				goto free_cs_object;
+
+			goto free_cs_object;
 		}
 
 		job->id = i + 1;
@@ -773,7 +772,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	struct hl_ctx *ctx = hpriv->ctx;
 	struct hl_cs_chunk *cs_chunk_array, *chunk;
 	struct hw_queue_properties *hw_queue_prop;
-	struct dma_fence *sig_fence = NULL;
+	struct hl_fence *sig_fence = NULL;
 	struct hl_cs_job *job;
 	struct hl_cs *cs;
 	struct hl_cb *cb;
@@ -808,6 +807,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 
 	/* currently it is guaranteed to have only one chunk */
 	chunk = &cs_chunk_array[0];
+
+	if (chunk->queue_index >= hdev->asic_prop.max_queues) {
+		dev_err(hdev->dev, "Queue index %d is invalid\n",
+			chunk->queue_index);
+		rc = -EINVAL;
+		goto free_cs_chunk_array;
+	}
+
 	q_idx = chunk->queue_index;
 	hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
 	q_type = hw_queue_prop->type;
@@ -875,14 +882,14 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 			dev_err(hdev->dev,
 				"CS seq 0x%llx is not of a signal CS\n",
 				signal_seq);
-			dma_fence_put(sig_fence);
+			hl_fence_put(sig_fence);
 			rc = -EINVAL;
 			goto free_signal_seq_array;
 		}
 
-		if (dma_fence_is_signaled(sig_fence)) {
+		if (completion_done(&sig_fence->completion)) {
 			/* signal CS already finished */
-			dma_fence_put(sig_fence);
+			hl_fence_put(sig_fence);
 			rc = 0;
 			goto free_signal_seq_array;
 		}
@@ -894,7 +901,7 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 	rc = allocate_cs(hdev, ctx, cs_type, &cs);
 	if (rc) {
 		if (cs_type == CS_TYPE_WAIT)
-			dma_fence_put(sig_fence);
+			hl_fence_put(sig_fence);
 		hl_ctx_put(ctx);
 		goto free_signal_seq_array;
 	}
@@ -1154,7 +1161,7 @@ out:
 static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 		struct hl_ctx *ctx, u64 timeout_us, u64 seq)
 {
-	struct dma_fence *fence;
+	struct hl_fence *fence;
 	unsigned long timeout;
 	long rc;
 
@@ -1173,12 +1180,18 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
 				"Can't wait on CS %llu because current CS is at seq %llu\n",
 				seq, ctx->cs_sequence);
 	} else if (fence) {
-		rc = dma_fence_wait_timeout(fence, true, timeout);
+		if (!timeout_us)
+			rc = completion_done(&fence->completion);
+		else
+			rc = wait_for_completion_interruptible_timeout(
+					&fence->completion, timeout);
+
 		if (fence->error == -ETIMEDOUT)
 			rc = -ETIMEDOUT;
 		else if (fence->error == -EIO)
 			rc = -EIO;
-		dma_fence_put(fence);
+
+		hl_fence_put(fence);
 	} else {
 		dev_dbg(hdev->dev,
 			"Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index 3e375958e73b..7a59dd7c6450 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -12,6 +12,7 @@
 static void hl_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
+	u64 idle_mask = 0;
 	int i;
 
 	/*
@@ -23,11 +24,13 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 	 */
 
 	for (i = 0 ; i < hdev->asic_prop.max_pending_cs ; i++)
-		dma_fence_put(ctx->cs_pending[i]);
+		hl_fence_put(ctx->cs_pending[i]);
 
 	kfree(ctx->cs_pending);
 
 	if (ctx->asid != HL_KERNEL_ASID_ID) {
+		dev_dbg(hdev->dev, "closing user context %d\n", ctx->asid);
+
 		/* The engines are stopped as there is no executing CS, but the
 		 * Coresight might be still working by accessing addresses
 		 * related to the stopped engines. Hence stop it explicitly.
@@ -37,9 +40,18 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
 			hl_device_set_debug_mode(hdev, false);
 
+		hl_cb_va_pool_fini(ctx);
 		hl_vm_ctx_fini(ctx);
 		hl_asid_free(hdev, ctx->asid);
+
+		if ((!hdev->pldm) && (hdev->pdev) &&
+				(!hdev->asic_funcs->is_device_idle(hdev,
+							&idle_mask, NULL)))
+			dev_notice(hdev->dev,
+				"device not idle after user context is closed (0x%llx)\n",
+				idle_mask);
 	} else {
+		dev_dbg(hdev->dev, "closing kernel context\n");
 		hl_mmu_ctx_fini(ctx);
 	}
 }
@@ -128,7 +140,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 	atomic_set(&ctx->thread_ctx_switch_token, 1);
 	ctx->thread_ctx_switch_wait_token = 0;
 	ctx->cs_pending = kcalloc(hdev->asic_prop.max_pending_cs,
-				sizeof(struct dma_fence *),
+				sizeof(struct hl_fence *),
 				GFP_KERNEL);
 	if (!ctx->cs_pending)
 		return -ENOMEM;
@@ -155,15 +167,26 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 			goto err_asid_free;
 		}
 
+		rc = hl_cb_va_pool_init(ctx);
+		if (rc) {
+			dev_err(hdev->dev,
+				"Failed to init VA pool for mapped CB\n");
+			goto err_vm_ctx_fini;
+		}
+
 		rc = hdev->asic_funcs->ctx_init(ctx);
 		if (rc) {
 			dev_err(hdev->dev, "ctx_init failed\n");
-			goto err_vm_ctx_fini;
+			goto err_cb_va_pool_fini;
 		}
+
+		dev_dbg(hdev->dev, "create user context %d\n", ctx->asid);
 	}
 
 	return 0;
 
+err_cb_va_pool_fini:
+	hl_cb_va_pool_fini(ctx);
 err_vm_ctx_fini:
 	hl_vm_ctx_fini(ctx);
 err_asid_free:
@@ -184,10 +207,10 @@ int hl_ctx_put(struct hl_ctx *ctx)
 	return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
-struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
+struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 {
 	struct asic_fixed_properties *asic_prop = &ctx->hdev->asic_prop;
-	struct dma_fence *fence;
+	struct hl_fence *fence;
 
 	spin_lock(&ctx->cs_lock);
 
@@ -201,8 +224,9 @@ struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq)
 		return NULL;
 	}
 
-	fence = dma_fence_get(
-			ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)]);
+	fence = ctx->cs_pending[seq & (asic_prop->max_pending_cs - 1)];
+	hl_fence_get(fence);
+
 	spin_unlock(&ctx->cs_lock);
 
 	return fence;
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index c50c6fc9e905..912ddfa360b1 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -19,9 +19,9 @@
 static struct dentry *hl_debug_root;
 
 static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
-				u8 i2c_reg, u32 *val)
+				u8 i2c_reg, long *val)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	if (hl_device_disabled_or_in_reset(hdev))
@@ -29,14 +29,14 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_RD <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.i2c_bus = i2c_bus;
 	pkt.i2c_addr = i2c_addr;
 	pkt.i2c_reg = i2c_reg;
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, (long *) val);
+						0, val);
 
 	if (rc)
 		dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
@@ -47,7 +47,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 				u8 i2c_reg, u32 val)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	if (hl_device_disabled_or_in_reset(hdev))
@@ -55,8 +55,8 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_I2C_WR <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.i2c_bus = i2c_bus;
 	pkt.i2c_addr = i2c_addr;
 	pkt.i2c_reg = i2c_reg;
@@ -73,7 +73,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 
 static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	if (hl_device_disabled_or_in_reset(hdev))
@@ -81,8 +81,8 @@ static void hl_debugfs_led_set(struct hl_device *hdev, u8 led, u8 state)
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_LED_SET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_LED_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.led_index = cpu_to_le32(led);
 	pkt.value = cpu_to_le64(state);
 
@@ -110,8 +110,8 @@ static int command_buffers_show(struct seq_file *s, void *data)
 			seq_puts(s, "---------------------------------------------------------------\n");
 		}
 		seq_printf(s,
-			"   %03d        %d    0x%08x      %d          %d          %d\n",
-			cb->id, cb->ctx_id, cb->size,
+			"   %03llu        %d    0x%08x      %d          %d          %d\n",
+			cb->id, cb->ctx->asid, cb->size,
 			kref_read(&cb->refcount),
 			cb->mmap, cb->cs_cnt);
 	}
@@ -354,6 +354,14 @@ static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
 					mmu_specs->hop4_shift);
 }
 
+static inline u64 get_hop5_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_specs,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_specs->hop5_mask,
+					mmu_specs->hop5_shift);
+}
+
 static inline u64 get_next_hop_addr(u64 curr_pte)
 {
 	if (curr_pte & PAGE_PRESENT_MASK)
@@ -377,6 +385,7 @@ static int mmu_show(struct seq_file *s, void *data)
 		hop2_addr = 0, hop2_pte_addr = 0, hop2_pte = 0,
 		hop3_addr = 0, hop3_pte_addr = 0, hop3_pte = 0,
 		hop4_addr = 0, hop4_pte_addr = 0, hop4_pte = 0,
+		hop5_addr = 0, hop5_pte_addr = 0, hop5_pte = 0,
 		virt_addr = dev_entry->mmu_addr;
 
 	if (!hdev->mmu_enable)
@@ -428,20 +437,49 @@ static int mmu_show(struct seq_file *s, void *data)
 	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
 	hop3_pte = hdev->asic_funcs->read_pte(hdev, hop3_pte_addr);
 
-	if (!(hop3_pte & LAST_MASK)) {
+	if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
+		if (!(hop3_pte & LAST_MASK)) {
+			hop4_addr = get_next_hop_addr(hop3_pte);
+
+			if (hop4_addr == ULLONG_MAX)
+				goto not_mapped;
+
+			hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
+							hop4_addr, virt_addr);
+			hop4_pte = hdev->asic_funcs->read_pte(hdev,
+								hop4_pte_addr);
+			if (!(hop4_pte & PAGE_PRESENT_MASK))
+				goto not_mapped;
+		} else {
+			if (!(hop3_pte & PAGE_PRESENT_MASK))
+				goto not_mapped;
+		}
+	} else {
 		hop4_addr = get_next_hop_addr(hop3_pte);
 
 		if (hop4_addr == ULLONG_MAX)
 			goto not_mapped;
 
-		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
-							virt_addr);
-		hop4_pte = hdev->asic_funcs->read_pte(hdev, hop4_pte_addr);
-		if (!(hop4_pte & PAGE_PRESENT_MASK))
-			goto not_mapped;
-	} else {
-		if (!(hop3_pte & PAGE_PRESENT_MASK))
-			goto not_mapped;
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop,
+						hop4_addr, virt_addr);
+		hop4_pte = hdev->asic_funcs->read_pte(hdev,
+							hop4_pte_addr);
+		if (!(hop4_pte & LAST_MASK)) {
+			hop5_addr = get_next_hop_addr(hop4_pte);
+
+			if (hop5_addr == ULLONG_MAX)
+				goto not_mapped;
+
+			hop5_pte_addr = get_hop5_pte_addr(ctx, mmu_prop,
+							hop5_addr, virt_addr);
+			hop5_pte = hdev->asic_funcs->read_pte(hdev,
+								hop5_pte_addr);
+			if (!(hop5_pte & PAGE_PRESENT_MASK))
+				goto not_mapped;
+		} else {
+			if (!(hop4_pte & PAGE_PRESENT_MASK))
+				goto not_mapped;
+		}
 	}
 
 	seq_printf(s, "asid: %u, virt_addr: 0x%llx\n",
@@ -463,10 +501,22 @@ static int mmu_show(struct seq_file *s, void *data)
 	seq_printf(s, "hop3_pte_addr: 0x%llx\n", hop3_pte_addr);
 	seq_printf(s, "hop3_pte: 0x%llx\n", hop3_pte);
 
-	if (!(hop3_pte & LAST_MASK)) {
+	if (mmu_prop->num_hops == MMU_ARCH_5_HOPS) {
+		if (!(hop3_pte & LAST_MASK)) {
+			seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
+			seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
+			seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
+		}
+	} else {
 		seq_printf(s, "hop4_addr: 0x%llx\n", hop4_addr);
 		seq_printf(s, "hop4_pte_addr: 0x%llx\n", hop4_pte_addr);
 		seq_printf(s, "hop4_pte: 0x%llx\n", hop4_pte);
+
+		if (!(hop4_pte & LAST_MASK)) {
+			seq_printf(s, "hop5_addr: 0x%llx\n", hop5_addr);
+			seq_printf(s, "hop5_pte_addr: 0x%llx\n", hop5_pte_addr);
+			seq_printf(s, "hop5_pte: 0x%llx\n", hop5_pte);
+		}
 	}
 
 	goto out;
@@ -827,7 +877,7 @@ static ssize_t hl_i2c_data_read(struct file *f, char __user *buf,
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
 	char tmp_buf[32];
-	u32 val;
+	long val;
 	ssize_t rc;
 
 	if (*ppos)
@@ -842,7 +892,7 @@ static ssize_t hl_i2c_data_read(struct file *f, char __user *buf,
 		return rc;
 	}
 
-	sprintf(tmp_buf, "0x%02x\n", val);
+	sprintf(tmp_buf, "0x%02lx\n", val);
 	rc = simple_read_from_buffer(buf, count, ppos, tmp_buf,
 			strlen(tmp_buf));
 
@@ -982,7 +1032,7 @@ static ssize_t hl_clk_gate_read(struct file *f, char __user *buf,
 		return 0;
 
 	sprintf(tmp_buf, "0x%llx\n", hdev->clock_gating_mask);
-	rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
+	rc = simple_read_from_buffer(buf, count, ppos, tmp_buf,
 			strlen(tmp_buf) + 1);
 
 	return rc;
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index be16b75bdfdb..20572224099a 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -123,9 +123,13 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
 static int hl_mmap(struct file *filp, struct vm_area_struct *vma)
 {
 	struct hl_fpriv *hpriv = filp->private_data;
+	unsigned long vm_pgoff;
 
-	if ((vma->vm_pgoff & HL_MMAP_CB_MASK) == HL_MMAP_CB_MASK) {
-		vma->vm_pgoff ^= HL_MMAP_CB_MASK;
+	vm_pgoff = vma->vm_pgoff;
+	vma->vm_pgoff = HL_MMAP_OFFSET_VALUE_GET(vm_pgoff);
+
+	switch (vm_pgoff & HL_MMAP_TYPE_MASK) {
+	case HL_MMAP_TYPE_CB:
 		return hl_cb_mmap(hpriv, vma);
 	}
 
@@ -286,9 +290,9 @@ static int device_early_init(struct hl_device *hdev)
 	}
 
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++) {
-		snprintf(workq_name, 32, "hl-free-jobs-%u", i);
+		snprintf(workq_name, 32, "hl-free-jobs-%u", (u32) i);
 		hdev->cq_wq[i] = create_singlethread_workqueue(workq_name);
-		if (hdev->cq_wq == NULL) {
+		if (hdev->cq_wq[i] == NULL) {
 			dev_err(hdev->dev, "Failed to allocate CQ workqueue\n");
 			rc = -ENOMEM;
 			goto free_cq_wq;
@@ -317,6 +321,10 @@ static int device_early_init(struct hl_device *hdev)
 		goto free_chip_info;
 	}
 
+	rc = hl_mmu_if_set_funcs(hdev);
+	if (rc)
+		goto free_idle_busy_ts_arr;
+
 	hl_cb_mgr_init(&hdev->kernel_cb_mgr);
 
 	mutex_init(&hdev->send_cpu_message_lock);
@@ -330,6 +338,8 @@ static int device_early_init(struct hl_device *hdev)
 
 	return 0;
 
+free_idle_busy_ts_arr:
+	kfree(hdev->idle_busy_ts_arr);
 free_chip_info:
 	kfree(hdev->hl_chip_info);
 free_eq_wq:
@@ -871,7 +881,7 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 			 * so this message won't be sent
 			 */
 			if (hl_fw_send_pci_access_msg(hdev,
-					ARMCP_PACKET_DISABLE_PCI_ACCESS))
+					CPUCP_PACKET_DISABLE_PCI_ACCESS))
 				dev_warn(hdev->dev,
 					"Failed to disable PCI access by F/W\n");
 		}
@@ -957,14 +967,13 @@ again:
 		flush_workqueue(hdev->eq_wq);
 	}
 
-	/* Release kernel context */
-	if ((hard_reset) && (hl_ctx_put(hdev->kernel_ctx) == 1))
-		hdev->kernel_ctx = NULL;
-
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, hard_reset);
 
 	if (hard_reset) {
+		/* Release kernel context */
+		if (hl_ctx_put(hdev->kernel_ctx) == 1)
+			hdev->kernel_ctx = NULL;
 		hl_vm_fini(hdev);
 		hl_mmu_fini(hdev);
 		hl_eq_reset(hdev, &hdev->event_queue);
@@ -1069,7 +1078,7 @@ again:
 			goto out_err;
 		}
 
-		hl_set_max_power(hdev, hdev->max_power);
+		hl_set_max_power(hdev);
 	} else {
 		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
 		if (rc) {
@@ -1318,6 +1327,11 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto out_disabled;
 	}
 
+	/* Need to call this again because the max power might change,
+	 * depending on card type for certain ASICs
+	 */
+	hl_set_max_power(hdev);
+
 	/*
 	 * hl_hwmon_init() must be called after device_late_init(), because only
 	 * there we get the information from the device about which
@@ -1450,13 +1464,13 @@ void hl_device_fini(struct hl_device *hdev)
 
 	hl_cb_pool_fini(hdev);
 
+	/* Reset the H/W. It will be in idle state after this returns */
+	hdev->asic_funcs->hw_fini(hdev, true);
+
 	/* Release kernel context */
 	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
 		dev_err(hdev->dev, "kernel ctx is still alive\n");
 
-	/* Reset the H/W. It will be in idle state after this returns */
-	hdev->asic_funcs->hw_fini(hdev, true);
-
 	hl_vm_fini(hdev);
 
 	hl_mmu_fini(hdev);
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index f70302cdab1b..cd41c7ceb0e7 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -13,6 +13,7 @@
 #include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/slab.h>
 
+#define FW_FILE_MAX_SIZE	0x1400000 /* maximum size of 20MB */
 /**
  * hl_fw_load_fw_to_device() - Load F/W code to device's memory.
  *
@@ -48,6 +49,14 @@ int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
 
 	dev_dbg(hdev->dev, "%s firmware size == %zu\n", fw_name, fw_size);
 
+	if (fw_size > FW_FILE_MAX_SIZE) {
+		dev_err(hdev->dev,
+			"FW file size %zu exceeds maximum of %u bytes\n",
+			fw_size, FW_FILE_MAX_SIZE);
+		rc = -EINVAL;
+		goto out;
+	}
+
 	fw_data = (const u64 *) fw->data;
 
 	memcpy_toio(dst, fw_data, fw_size);
@@ -59,9 +68,9 @@ out:
 
 int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
 {
-	struct armcp_packet pkt = {};
+	struct cpucp_packet pkt = {};
 
-	pkt.ctl = cpu_to_le32(opcode << ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(opcode << CPUCP_PKT_CTL_OPCODE_SHIFT);
 
 	return hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt,
 						sizeof(pkt), 0, NULL);
@@ -70,7 +79,7 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
 int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 				u16 len, u32 timeout, long *result)
 {
-	struct armcp_packet *pkt;
+	struct cpucp_packet *pkt;
 	dma_addr_t pkt_dma_addr;
 	u32 tmp;
 	int rc = 0;
@@ -102,7 +111,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 	}
 
 	rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
-				(tmp == ARMCP_PACKET_FENCE_VAL), 1000,
+				(tmp == CPUCP_PACKET_FENCE_VAL), 1000,
 				timeout, true);
 
 	hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
@@ -115,12 +124,12 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
 	tmp = le32_to_cpu(pkt->ctl);
 
-	rc = (tmp & ARMCP_PKT_CTL_RC_MASK) >> ARMCP_PKT_CTL_RC_SHIFT;
+	rc = (tmp & CPUCP_PKT_CTL_RC_MASK) >> CPUCP_PKT_CTL_RC_SHIFT;
 	if (rc) {
 		dev_err(hdev->dev, "F/W ERROR %d for CPU packet %d\n",
 			rc,
-			(tmp & ARMCP_PKT_CTL_OPCODE_MASK)
-						>> ARMCP_PKT_CTL_OPCODE_SHIFT);
+			(tmp & CPUCP_PKT_CTL_OPCODE_MASK)
+						>> CPUCP_PKT_CTL_OPCODE_SHIFT);
 		rc = -EIO;
 	} else if (result) {
 		*result = (long) le64_to_cpu(pkt->result);
@@ -136,14 +145,14 @@ out:
 
 int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.value = cpu_to_le64(event_type);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@@ -158,15 +167,15 @@ int hl_fw_unmask_irq(struct hl_device *hdev, u16 event_type)
 int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 		size_t irq_arr_size)
 {
-	struct armcp_unmask_irq_arr_packet *pkt;
+	struct cpucp_unmask_irq_arr_packet *pkt;
 	size_t total_pkt_size;
 	long result;
 	int rc;
 
-	total_pkt_size = sizeof(struct armcp_unmask_irq_arr_packet) +
+	total_pkt_size = sizeof(struct cpucp_unmask_irq_arr_packet) +
 			irq_arr_size;
 
-	/* data should be aligned to 8 bytes in order to ArmCP to copy it */
+	/* data should be aligned to 8 bytes in order to CPU-CP to copy it */
 	total_pkt_size = (total_pkt_size + 0x7) & ~0x7;
 
 	/* total_pkt_size is casted to u16 later on */
@@ -182,8 +191,8 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 	pkt->length = cpu_to_le32(irq_arr_size / sizeof(irq_arr[0]));
 	memcpy(&pkt->irqs, irq_arr, irq_arr_size);
 
-	pkt->armcp_pkt.ctl = cpu_to_le32(ARMCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
-						ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt->cpucp_pkt.ctl = cpu_to_le32(CPUCP_PACKET_UNMASK_RAZWI_IRQ_ARRAY <<
+						CPUCP_PKT_CTL_OPCODE_SHIFT);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) pkt,
 						total_pkt_size, 0, &result);
@@ -198,19 +207,19 @@ int hl_fw_unmask_irq_arr(struct hl_device *hdev, const u32 *irq_arr,
 
 int hl_fw_test_cpu_queue(struct hl_device *hdev)
 {
-	struct armcp_packet test_pkt = {};
+	struct cpucp_packet test_pkt = {};
 	long result;
 	int rc;
 
-	test_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
-					ARMCP_PKT_CTL_OPCODE_SHIFT);
-	test_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
+	test_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
+					CPUCP_PKT_CTL_OPCODE_SHIFT);
+	test_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &test_pkt,
 						sizeof(test_pkt), 0, &result);
 
 	if (!rc) {
-		if (result != ARMCP_PACKET_FENCE_VAL)
+		if (result != CPUCP_PACKET_FENCE_VAL)
 			dev_err(hdev->dev,
 				"CPU queue test failed (0x%08lX)\n", result);
 	} else {
@@ -242,61 +251,61 @@ void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 
 int hl_fw_send_heartbeat(struct hl_device *hdev)
 {
-	struct armcp_packet hb_pkt = {};
+	struct cpucp_packet hb_pkt = {};
 	long result;
 	int rc;
 
-	hb_pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEST <<
-					ARMCP_PKT_CTL_OPCODE_SHIFT);
-	hb_pkt.value = cpu_to_le64(ARMCP_PACKET_FENCE_VAL);
+	hb_pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEST <<
+					CPUCP_PKT_CTL_OPCODE_SHIFT);
+	hb_pkt.value = cpu_to_le64(CPUCP_PACKET_FENCE_VAL);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &hb_pkt,
 						sizeof(hb_pkt), 0, &result);
 
-	if ((rc) || (result != ARMCP_PACKET_FENCE_VAL))
+	if ((rc) || (result != CPUCP_PACKET_FENCE_VAL))
 		rc = -EIO;
 
 	return rc;
 }
 
-int hl_fw_armcp_info_get(struct hl_device *hdev)
+int hl_fw_cpucp_info_get(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct armcp_packet pkt = {};
-	void *armcp_info_cpu_addr;
-	dma_addr_t armcp_info_dma_addr;
+	struct cpucp_packet pkt = {};
+	void *cpucp_info_cpu_addr;
+	dma_addr_t cpucp_info_dma_addr;
 	long result;
 	int rc;
 
-	armcp_info_cpu_addr =
+	cpucp_info_cpu_addr =
 			hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
-					sizeof(struct armcp_info),
-					&armcp_info_dma_addr);
-	if (!armcp_info_cpu_addr) {
+					sizeof(struct cpucp_info),
+					&cpucp_info_dma_addr);
+	if (!cpucp_info_cpu_addr) {
 		dev_err(hdev->dev,
-			"Failed to allocate DMA memory for ArmCP info packet\n");
+			"Failed to allocate DMA memory for CPU-CP info packet\n");
 		return -ENOMEM;
 	}
 
-	memset(armcp_info_cpu_addr, 0, sizeof(struct armcp_info));
+	memset(cpucp_info_cpu_addr, 0, sizeof(struct cpucp_info));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_INFO_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
-	pkt.addr = cpu_to_le64(armcp_info_dma_addr);
-	pkt.data_max_size = cpu_to_le32(sizeof(struct armcp_info));
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_INFO_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.addr = cpu_to_le64(cpucp_info_dma_addr);
+	pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_info));
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-					HL_ARMCP_INFO_TIMEOUT_USEC, &result);
+					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
 	if (rc) {
 		dev_err(hdev->dev,
-			"Failed to handle ArmCP info pkt, error %d\n", rc);
+			"Failed to handle CPU-CP info pkt, error %d\n", rc);
 		goto out;
 	}
 
-	memcpy(&prop->armcp_info, armcp_info_cpu_addr,
-			sizeof(prop->armcp_info));
+	memcpy(&prop->cpucp_info, cpucp_info_cpu_addr,
+			sizeof(prop->cpucp_info));
 
-	rc = hl_build_hwmon_channel_info(hdev, prop->armcp_info.sensors);
+	rc = hl_build_hwmon_channel_info(hdev, prop->cpucp_info.sensors);
 	if (rc) {
 		dev_err(hdev->dev,
 			"Failed to build hwmon channel info, error %d\n", rc);
@@ -306,14 +315,14 @@ int hl_fw_armcp_info_get(struct hl_device *hdev)
 
 out:
 	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
-			sizeof(struct armcp_info), armcp_info_cpu_addr);
+			sizeof(struct cpucp_info), cpucp_info_cpu_addr);
 
 	return rc;
 }
 
 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 {
-	struct armcp_packet pkt = {};
+	struct cpucp_packet pkt = {};
 	void *eeprom_info_cpu_addr;
 	dma_addr_t eeprom_info_dma_addr;
 	long result;
@@ -324,23 +333,24 @@ int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size)
 					max_size, &eeprom_info_dma_addr);
 	if (!eeprom_info_cpu_addr) {
 		dev_err(hdev->dev,
-			"Failed to allocate DMA memory for ArmCP EEPROM packet\n");
+			"Failed to allocate DMA memory for CPU-CP EEPROM packet\n");
 		return -ENOMEM;
 	}
 
 	memset(eeprom_info_cpu_addr, 0, max_size);
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_EEPROM_DATA_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_EEPROM_DATA_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.addr = cpu_to_le64(eeprom_info_dma_addr);
 	pkt.data_max_size = cpu_to_le32(max_size);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-			HL_ARMCP_EEPROM_TIMEOUT_USEC, &result);
+			HL_CPUCP_EEPROM_TIMEOUT_USEC, &result);
 
 	if (rc) {
 		dev_err(hdev->dev,
-			"Failed to handle ArmCP EEPROM packet, error %d\n", rc);
+			"Failed to handle CPU-CP EEPROM packet, error %d\n",
+			rc);
 		goto out;
 	}
 
@@ -354,6 +364,77 @@ out:
 	return rc;
 }
 
+int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
+		struct hl_info_pci_counters *counters)
+{
+	struct cpucp_packet pkt = {};
+	long result;
+	int rc;
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_THROUGHPUT_GET <<
+			CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+	/* Fetch PCI rx counter */
+	pkt.index = cpu_to_le32(cpucp_pcie_throughput_rx);
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+		return rc;
+	}
+	counters->rx_throughput = result;
+
+	/* Fetch PCI tx counter */
+	pkt.index = cpu_to_le32(cpucp_pcie_throughput_tx);
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+		return rc;
+	}
+	counters->tx_throughput = result;
+
+	/* Fetch PCI replay counter */
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PCIE_REPLAY_CNT_GET <<
+			CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to handle CPU-CP PCI info pkt, error %d\n", rc);
+		return rc;
+	}
+	counters->replay_cnt = (u32) result;
+
+	return rc;
+}
+
+int hl_fw_cpucp_total_energy_get(struct hl_device *hdev, u64 *total_energy)
+{
+	struct cpucp_packet pkt = {};
+	long result;
+	int rc;
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_TOTAL_ENERGY_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to handle CpuCP total energy pkt, error %d\n",
+				rc);
+		return rc;
+	}
+
+	*total_energy = result;
+
+	return rc;
+}
+
 static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
 {
 	u32 err_val;
@@ -393,8 +474,11 @@ static void fw_read_errors(struct hl_device *hdev, u32 boot_err0_reg)
 			"Device boot error - NIC F/W initialization failed\n");
 }
 
-static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
+static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 {
+	/* Some of the status codes below are deprecated in newer f/w
+	 * versions but we keep them here for backward compatibility
+	 */
 	switch (status) {
 	case CPU_BOOT_STATUS_NA:
 		dev_err(hdev->dev,
@@ -440,6 +524,48 @@ static void hl_detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 	}
 }
 
+int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
+				u32 boot_err0_reg, u32 timeout)
+{
+	u32 status;
+	int rc;
+
+	if (!hdev->cpu_enable)
+		return 0;
+
+	/* Need to check two possible scenarios:
+	 *
+	 * CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT - for newer firmwares where
+	 * the preboot is waiting for the boot fit
+	 *
+	 * All other status values - for older firmwares where the uboot was
+	 * loaded from the FLASH
+	 */
+	rc = hl_poll_timeout(
+		hdev,
+		cpu_boot_status_reg,
+		status,
+		(status == CPU_BOOT_STATUS_IN_UBOOT) ||
+		(status == CPU_BOOT_STATUS_DRAM_RDY) ||
+		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
+		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
+		(status == CPU_BOOT_STATUS_SRAM_AVAIL) ||
+		(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
+		10000,
+		timeout);
+
+	if (rc) {
+		dev_err(hdev->dev, "Failed to read preboot version\n");
+		detect_cpu_boot_status(hdev, status);
+		fw_read_errors(hdev, boot_err0_reg);
+		return -EIO;
+	}
+
+	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
+
+	return 0;
+}
+
 int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 			u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
 			u32 boot_err0_reg, bool skip_bmc,
@@ -505,15 +631,11 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 		10000,
 		cpu_timeout);
 
-	/* Read U-Boot, preboot versions now in case we will later fail */
+	/* Read U-Boot version now in case we will later fail */
 	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_UBOOT);
-	hdev->asic_funcs->read_device_fw_version(hdev, FW_COMP_PREBOOT);
 
-	/* Some of the status codes below are deprecated in newer f/w
-	 * versions but we keep them here for backward compatibility
-	 */
 	if (rc) {
-		hl_detect_cpu_boot_status(hdev, status);
+		detect_cpu_boot_status(hdev, status);
 		rc = -EIO;
 		goto out;
 	}
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index 018d9d67e8e6..6ed974d2def0 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -8,21 +8,33 @@
 #ifndef HABANALABSP_H_
 #define HABANALABSP_H_
 
-#include "../include/common/armcp_if.h"
+#include "../include/common/cpucp_if.h"
 #include "../include/common/qman_if.h"
 #include <uapi/misc/habanalabs.h>
 
 #include <linux/cdev.h>
 #include <linux/iopoll.h>
 #include <linux/irqreturn.h>
-#include <linux/dma-fence.h>
 #include <linux/dma-direction.h>
 #include <linux/scatterlist.h>
 #include <linux/hashtable.h>
+#include <linux/bitfield.h>
 
 #define HL_NAME				"habanalabs"
 
-#define HL_MMAP_CB_MASK			(0x8000000000000000ull >> PAGE_SHIFT)
+/* Use upper bits of mmap offset to store habana driver specific information.
+ * bits[63:62] - Encode mmap type
+ * bits[45:0]  - mmap offset value
+ *
+ * NOTE: struct vm_area_struct.vm_pgoff uses offset in pages. Hence, these
+ *  defines are w.r.t to PAGE_SIZE
+ */
+#define HL_MMAP_TYPE_SHIFT		(62 - PAGE_SHIFT)
+#define HL_MMAP_TYPE_MASK		(0x3ull << HL_MMAP_TYPE_SHIFT)
+#define HL_MMAP_TYPE_CB			(0x2ull << HL_MMAP_TYPE_SHIFT)
+
+#define HL_MMAP_OFFSET_VALUE_MASK	(0x3FFFFFFFFFFFull >> PAGE_SHIFT)
+#define HL_MMAP_OFFSET_VALUE_GET(off)	(off & HL_MMAP_OFFSET_VALUE_MASK)
 
 #define HL_PENDING_RESET_PER_SEC	30
 
@@ -34,8 +46,8 @@
 
 #define HL_PLL_LOW_JOB_FREQ_USEC	5000000 /* 5 s */
 
-#define HL_ARMCP_INFO_TIMEOUT_USEC	10000000 /* 10s */
-#define HL_ARMCP_EEPROM_TIMEOUT_USEC	10000000 /* 10s */
+#define HL_CPUCP_INFO_TIMEOUT_USEC	10000000 /* 10s */
+#define HL_CPUCP_EEPROM_TIMEOUT_USEC	10000000 /* 10s */
 
 #define HL_PCI_ELBI_TIMEOUT_MSEC	10 /* 10ms */
 
@@ -66,6 +78,8 @@
 
 #define HL_PCI_NUM_BARS			6
 
+#define HL_MAX_DCORES			4
+
 /**
  * struct pgt_info - MMU hop page info.
  * @node: hash linked-list node for the pgts shadow hash of pgts.
@@ -222,12 +236,15 @@ enum hl_device_hw_state {
  * @hop2_shift: shift of hop 2 mask.
  * @hop3_shift: shift of hop 3 mask.
  * @hop4_shift: shift of hop 4 mask.
+ * @hop5_shift: shift of hop 5 mask.
  * @hop0_mask: mask to get the PTE address in hop 0.
  * @hop1_mask: mask to get the PTE address in hop 1.
  * @hop2_mask: mask to get the PTE address in hop 2.
  * @hop3_mask: mask to get the PTE address in hop 3.
  * @hop4_mask: mask to get the PTE address in hop 4.
+ * @hop5_mask: mask to get the PTE address in hop 5.
  * @page_size: default page size used to allocate memory.
+ * @num_hops: The amount of hops supported by the translation table.
  */
 struct hl_mmu_properties {
 	u64	start_addr;
@@ -237,18 +254,21 @@ struct hl_mmu_properties {
 	u64	hop2_shift;
 	u64	hop3_shift;
 	u64	hop4_shift;
+	u64	hop5_shift;
 	u64	hop0_mask;
 	u64	hop1_mask;
 	u64	hop2_mask;
 	u64	hop3_mask;
 	u64	hop4_mask;
+	u64	hop5_mask;
 	u32	page_size;
+	u32	num_hops;
 };
 
 /**
  * struct asic_fixed_properties - ASIC specific immutable properties.
  * @hw_queues_props: H/W queues properties.
- * @armcp_info: received various information from ArmCP regarding the H/W, e.g.
+ * @cpucp_info: received various information from CPU-CP regarding the H/W, e.g.
  *		available sensors.
  * @uboot_ver: F/W U-boot version.
  * @preboot_ver: F/W Preboot version.
@@ -271,6 +291,10 @@ struct hl_mmu_properties {
  * @pcie_aux_dbi_reg_addr: Address of the PCIE_AUX DBI register.
  * @mmu_pgt_addr: base physical address in DRAM of MMU page tables.
  * @mmu_dram_default_page_addr: DRAM default page physical address.
+ * @cb_va_start_addr: virtual start address of command buffers which are mapped
+ *                    to the device's MMU.
+ * @cb_va_end_addr: virtual end address of command buffers which are mapped to
+ *                  the device's MMU.
  * @mmu_pgt_size: MMU page tables total size.
  * @mmu_pte_size: PTE size in MMU page tables.
  * @mmu_hop_table_size: MMU hop table size.
@@ -292,12 +316,16 @@ struct hl_mmu_properties {
  * @max_queues: maximum amount of queues in the system
  * @sync_stream_first_sob: first sync object available for sync stream use
  * @sync_stream_first_mon: first monitor available for sync stream use
+ * @first_available_user_sob: first sob available for the user
+ * @first_available_user_mon: first monitor available for the user
  * @tpc_enabled_mask: which TPCs are enabled.
  * @completion_queues_count: number of completion queues.
+ * @fw_security_disabled: true if security measures are disabled in firmware,
+ *                        false otherwise
  */
 struct asic_fixed_properties {
 	struct hw_queue_properties	*hw_queues_props;
-	struct armcp_info		armcp_info;
+	struct cpucp_info		cpucp_info;
 	char				uboot_ver[VERSION_MAX_LEN];
 	char				preboot_ver[VERSION_MAX_LEN];
 	struct hl_mmu_properties	dmmu;
@@ -317,6 +345,8 @@ struct asic_fixed_properties {
 	u64				pcie_aux_dbi_reg_addr;
 	u64				mmu_pgt_addr;
 	u64				mmu_dram_default_page_addr;
+	u64				cb_va_start_addr;
+	u64				cb_va_end_addr;
 	u32				mmu_pgt_size;
 	u32				mmu_pte_size;
 	u32				mmu_hop_table_size;
@@ -338,13 +368,29 @@ struct asic_fixed_properties {
 	u32				max_queues;
 	u16				sync_stream_first_sob;
 	u16				sync_stream_first_mon;
+	u16				first_available_user_sob[HL_MAX_DCORES];
+	u16				first_available_user_mon[HL_MAX_DCORES];
 	u8				tpc_enabled_mask;
 	u8				completion_queues_count;
+	u8				fw_security_disabled;
+};
+
+/**
+ * struct hl_fence - software synchronization primitive
+ * @completion: fence is implemented using completion
+ * @refcount: refcount for this fence
+ * @error: mark this fence with error
+ *
+ */
+struct hl_fence {
+	struct completion	completion;
+	struct kref		refcount;
+	int			error;
 };
 
 /**
  * struct hl_cs_compl - command submission completion object.
- * @base_fence: kernel fence object.
+ * @base_fence: hl fence object.
  * @lock: spinlock to protect fence.
  * @hdev: habanalabs device structure.
  * @hw_sob: the H/W SOB used in this signal/wait CS.
@@ -353,7 +399,7 @@ struct asic_fixed_properties {
  * @sob_val: the SOB value that is used in this signal/wait CS.
  */
 struct hl_cs_compl {
-	struct dma_fence	base_fence;
+	struct hl_fence		base_fence;
 	spinlock_t		lock;
 	struct hl_device	*hdev;
 	struct hl_hw_sob	*hw_sob;
@@ -380,36 +426,41 @@ struct hl_cb_mgr {
  * struct hl_cb - describes a Command Buffer.
  * @refcount: reference counter for usage of the CB.
  * @hdev: pointer to device this CB belongs to.
+ * @ctx: pointer to the CB owner's context.
  * @lock: spinlock to protect mmap/cs flows.
  * @debugfs_list: node in debugfs list of command buffers.
  * @pool_list: node in pool list of command buffers.
+ * @va_block_list: list of virtual addresses blocks of the CB if it is mapped to
+ *                 the device's MMU.
+ * @id: the CB's ID.
  * @kernel_address: Holds the CB's kernel virtual address.
  * @bus_address: Holds the CB's DMA address.
  * @mmap_size: Holds the CB's size that was mmaped.
  * @size: holds the CB's size.
- * @id: the CB's ID.
  * @cs_cnt: holds number of CS that this CB participates in.
- * @ctx_id: holds the ID of the owner's context.
  * @mmap: true if the CB is currently mmaped to user.
  * @is_pool: true if CB was acquired from the pool, false otherwise.
  * @is_internal: internaly allocated
+ * @is_mmu_mapped: true if the CB is mapped to the device's MMU.
  */
 struct hl_cb {
 	struct kref		refcount;
 	struct hl_device	*hdev;
+	struct hl_ctx		*ctx;
 	spinlock_t		lock;
 	struct list_head	debugfs_list;
 	struct list_head	pool_list;
-	u64			kernel_address;
+	struct list_head	va_block_list;
+	u64			id;
+	void			*kernel_address;
 	dma_addr_t		bus_address;
 	u32			mmap_size;
 	u32			size;
-	u32			id;
 	u32			cs_cnt;
-	u32			ctx_id;
 	u8			mmap;
 	u8			is_pool;
 	u8			is_internal;
+	u8			is_mmu_mapped;
 };
 
 
@@ -435,7 +486,7 @@ struct hl_cs_job;
 #define HL_EQ_LENGTH			64
 #define HL_EQ_SIZE_IN_BYTES		(HL_EQ_LENGTH * HL_EQ_ENTRY_SIZE)
 
-/* Host <-> ArmCP shared memory size */
+/* Host <-> CPU-CP shared memory size */
 #define HL_CPU_ACCESSIBLE_MEM_SIZE	SZ_2M
 
 /**
@@ -464,7 +515,7 @@ struct hl_hw_queue {
 	struct hl_hw_sob	hw_sob[HL_RSVD_SOBS];
 	struct hl_cs_job	**shadow_queue;
 	enum hl_queue_type	queue_type;
-	u64			kernel_address;
+	void			*kernel_address;
 	dma_addr_t		bus_address;
 	u32			pi;
 	atomic_t		ci;
@@ -493,7 +544,7 @@ struct hl_hw_queue {
  */
 struct hl_cq {
 	struct hl_device	*hdev;
-	u64			kernel_address;
+	void			*kernel_address;
 	dma_addr_t		bus_address;
 	u32			cq_idx;
 	u32			hw_queue_id;
@@ -511,7 +562,7 @@ struct hl_cq {
  */
 struct hl_eq {
 	struct hl_device	*hdev;
-	u64			kernel_address;
+	void			*kernel_address;
 	dma_addr_t		bus_address;
 	u32			ci;
 };
@@ -617,7 +668,7 @@ enum div_select_defs {
  * @debugfs_read32: debug interface for reading u32 from DRAM/SRAM.
  * @debugfs_write32: debug interface for writing u32 to DRAM/SRAM.
  * @add_device_attr: add ASIC specific device attributes.
- * @handle_eqe: handle event queue entry (IRQ) from ArmCP.
+ * @handle_eqe: handle event queue entry (IRQ) from CPU-CP.
  * @set_pll_profile: change PLL profile (manual/automatic).
  * @get_events_stat: retrieve event queue entries histogram.
  * @read_pte: read MMU page table entry from DRAM.
@@ -626,7 +677,7 @@ enum div_select_defs {
  *                        (L1 only) or hard (L0 & L1) flush.
  * @mmu_invalidate_cache_range: flush specific MMU STLB cache lines with
  *                              ASID-VA-size mask.
- * @send_heartbeat: send is-alive packet to ArmCP and verify response.
+ * @send_heartbeat: send is-alive packet to CPU-CP and verify response.
  * @set_clock_gating: enable/disable clock gating per engine according to
  *                    clock gating mask in hdev
  * @disable_clock_gating: disable clock gating completely
@@ -644,8 +695,6 @@ enum div_select_defs {
  *                    ASIC
  * @get_hw_state: retrieve the H/W state
  * @pci_bars_map: Map PCI BARs.
- * @set_dram_bar_base: Set DRAM BAR to map specific device address. Returns
- *                     old address the bar pointed to or U64_MAX for failure
  * @init_iatu: Initialize the iATU unit inside the PCI controller.
  * @rreg: Read a register. Needed for simulator support.
  * @wreg: Write a register. Needed for simulator support.
@@ -679,7 +728,7 @@ struct hl_asic_funcs {
 	int (*suspend)(struct hl_device *hdev);
 	int (*resume)(struct hl_device *hdev);
 	int (*cb_mmap)(struct hl_device *hdev, struct vm_area_struct *vma,
-			u64 kaddress, phys_addr_t paddress, u32 size);
+			void *cpu_addr, dma_addr_t dma_addr, size_t size);
 	void (*ring_doorbell)(struct hl_device *hdev, u32 hw_queue_id, u32 pi);
 	void (*pqe_write)(struct hl_device *hdev, __le64 *pqe,
 			struct hl_bd *bd);
@@ -708,7 +757,7 @@ struct hl_asic_funcs {
 	u32 (*get_dma_desc_list_size)(struct hl_device *hdev,
 					struct sg_table *sgt);
 	void (*add_end_of_cb_packets)(struct hl_device *hdev,
-					u64 kernel_address, u32 len,
+					void *kernel_address, u32 len,
 					u64 cq_addr, u32 cq_val, u32 msix_num,
 					bool eb);
 	void (*update_eq_ci)(struct hl_device *hdev, u32 val);
@@ -736,7 +785,7 @@ struct hl_asic_funcs {
 	void (*set_clock_gating)(struct hl_device *hdev);
 	void (*disable_clock_gating)(struct hl_device *hdev);
 	int (*debug_coresight)(struct hl_device *hdev, void *data);
-	bool (*is_device_idle)(struct hl_device *hdev, u32 *mask,
+	bool (*is_device_idle)(struct hl_device *hdev, u64 *mask,
 				struct seq_file *s);
 	int (*soft_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
@@ -748,7 +797,6 @@ struct hl_asic_funcs {
 				u16 len, u32 timeout, long *result);
 	enum hl_device_hw_state (*get_hw_state)(struct hl_device *hdev);
 	int (*pci_bars_map)(struct hl_device *hdev);
-	u64 (*set_dram_bar_base)(struct hl_device *hdev, u64 addr);
 	int (*init_iatu)(struct hl_device *hdev);
 	u32 (*rreg)(struct hl_device *hdev, u32 reg);
 	void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
@@ -800,7 +848,7 @@ struct hl_va_range {
  * @hdev: pointer to the device structure.
  * @refcount: reference counter for the context. Context is released only when
  *		this hits 0l. It is incremented on CS and CS_WAIT.
- * @cs_pending: array of DMA fence objects representing pending CS.
+ * @cs_pending: array of hl fence objects representing pending CS.
  * @host_va_range: holds available virtual addresses for host mappings.
  * @host_huge_va_range: holds available virtual addresses for host mappings
  *                      with huge pages.
@@ -809,6 +857,8 @@ struct hl_va_range {
  * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
  *            MMU hash or walking the PGT requires talking this lock.
  * @debugfs_list: node in debugfs list of contexts.
+ * @cb_va_pool: device VA pool for command buffers which are mapped to the
+ *              device's MMU.
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
  *			to user so user could inquire about CS. It is used as
  *			index to cs_pending array.
@@ -832,7 +882,7 @@ struct hl_ctx {
 	struct hl_fpriv		*hpriv;
 	struct hl_device	*hdev;
 	struct kref		refcount;
-	struct dma_fence	**cs_pending;
+	struct hl_fence		**cs_pending;
 	struct hl_va_range	*host_va_range;
 	struct hl_va_range	*host_huge_va_range;
 	struct hl_va_range	*dram_va_range;
@@ -840,6 +890,7 @@ struct hl_ctx {
 	struct mutex		mmu_lock;
 	struct list_head	debugfs_list;
 	struct hl_cs_counters	cs_counters;
+	struct gen_pool		*cb_va_pool;
 	u64			cs_sequence;
 	u64			*dram_default_hops;
 	spinlock_t		cs_lock;
@@ -919,8 +970,8 @@ struct hl_cs {
 	struct list_head	job_list;
 	spinlock_t		job_lock;
 	struct kref		refcount;
-	struct dma_fence	*fence;
-	struct dma_fence	*signal_fence;
+	struct hl_fence		*fence;
+	struct hl_fence		*signal_fence;
 	struct work_struct	finish_work;
 	struct delayed_work	work_tdr;
 	struct list_head	mirror_node;
@@ -1331,13 +1382,13 @@ void hl_wreg(struct hl_device *hdev, u32 reg, u32 val);
 	for (;;) { \
 		/* Verify we read updates done by other cores or by device */ \
 		mb(); \
-		(val) = *((u32 *) (uintptr_t) (addr)); \
+		(val) = *((u32 *)(addr)); \
 		if (mem_written_by_device) \
 			(val) = le32_to_cpu(*(__le32 *) &(val)); \
 		if (cond) \
 			break; \
 		if (timeout_us && ktime_compare(ktime_get(), __timeout) > 0) { \
-			(val) = *((u32 *) (uintptr_t) (addr)); \
+			(val) = *((u32 *)(addr)); \
 			if (mem_written_by_device) \
 				(val) = le32_to_cpu(*(__le32 *) &(val)); \
 			break; \
@@ -1395,6 +1446,44 @@ struct hl_device_idle_busy_ts {
 	ktime_t				busy_to_idle_ts;
 };
 
+
+/**
+ * struct hl_mmu_priv - used for holding per-device mmu internal information.
+ * @mmu_pgt_pool: pool of page tables used by MMU for allocating hops.
+ * @mmu_shadow_hop0: shadow array of hop0 tables.
+ */
+struct hl_mmu_priv {
+	struct gen_pool *mmu_pgt_pool;
+	void *mmu_shadow_hop0;
+};
+
+/**
+ * struct hl_mmu_funcs - Device related MMU functions.
+ * @init: initialize the MMU module.
+ * @fini: release the MMU module.
+ * @ctx_init: Initialize a context for using the MMU module.
+ * @ctx_fini: disable a ctx from using the mmu module.
+ * @map: maps a virtual address to physical address for a context.
+ * @unmap: unmap a virtual address of a context.
+ * @flush: flush all writes from all cores to reach device MMU.
+ * @swap_out: marks all mapping of the given context as swapped out.
+ * @swap_in: marks all mapping of the given context as swapped in.
+ */
+struct hl_mmu_funcs {
+	int (*init)(struct hl_device *hdev);
+	void (*fini)(struct hl_device *hdev);
+	int (*ctx_init)(struct hl_ctx *ctx);
+	void (*ctx_fini)(struct hl_ctx *ctx);
+	int (*map)(struct hl_ctx *ctx,
+			u64 virt_addr, u64 phys_addr, u32 page_size,
+			bool is_dram_addr);
+	int (*unmap)(struct hl_ctx *ctx,
+			u64 virt_addr, bool is_dram_addr);
+	void (*flush)(struct hl_ctx *ctx);
+	void (*swap_out)(struct hl_ctx *ctx);
+	void (*swap_in)(struct hl_ctx *ctx);
+};
+
 /**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
@@ -1407,8 +1496,8 @@ struct hl_device_idle_busy_ts {
  * @dev: related kernel basic device structure.
  * @dev_ctrl: related kernel device structure for the control device
  * @work_freq: delayed work to lower device frequency if possible.
- * @work_heartbeat: delayed work for ArmCP is-alive check.
- * @asic_name: ASIC specific nmae.
+ * @work_heartbeat: delayed work for CPU-CP is-alive check.
+ * @asic_name: ASIC specific name.
  * @asic_type: ASIC specific type.
  * @completion_queue: array of hl_cq.
  * @cq_wq: work queues of completion queues for executing work in process
@@ -1419,22 +1508,20 @@ struct hl_device_idle_busy_ts {
  * @hw_queues_mirror_list: CS mirror list for TDR.
  * @hw_queues_mirror_lock: protects hw_queues_mirror_list.
  * @kernel_cb_mgr: command buffer manager for creating/destroying/handling CGs.
- * @event_queue: event queue for IRQ from ArmCP.
+ * @event_queue: event queue for IRQ from CPU-CP.
  * @dma_pool: DMA pool for small allocations.
- * @cpu_accessible_dma_mem: Host <-> ArmCP shared memory CPU address.
- * @cpu_accessible_dma_address: Host <-> ArmCP shared memory DMA address.
- * @cpu_accessible_dma_pool: Host <-> ArmCP shared memory pool.
+ * @cpu_accessible_dma_mem: Host <-> CPU-CP shared memory CPU address.
+ * @cpu_accessible_dma_address: Host <-> CPU-CP shared memory DMA address.
+ * @cpu_accessible_dma_pool: Host <-> CPU-CP shared memory pool.
  * @asid_bitmap: holds used/available ASIDs.
  * @asid_mutex: protects asid_bitmap.
- * @send_cpu_message_lock: enforces only one message in Host <-> ArmCP queue.
+ * @send_cpu_message_lock: enforces only one message in Host <-> CPU-CP queue.
  * @debug_lock: protects critical section of setting debug mode for device
  * @asic_prop: ASIC specific immutable properties.
  * @asic_funcs: ASIC specific functions.
  * @asic_specific: ASIC specific information to use only from ASIC files.
- * @mmu_pgt_pool: pool of available MMU hops.
  * @vm: virtual memory manager for MMU.
  * @mmu_cache_lock: protects MMU cache invalidation as it can serve one context.
- * @mmu_shadow_hop0: shadow mapping of the MMU hop 0 zone.
  * @hwmon_dev: H/W monitor device.
  * @pm_mng_profile: current power management profile.
  * @hl_chip_info: ASIC's sensors information.
@@ -1452,6 +1539,8 @@ struct hl_device_idle_busy_ts {
  * @idle_busy_ts_arr: array to hold time stamps of transitions from idle to busy
  *                    and vice-versa
  * @aggregated_cs_counters: aggregated cs counters among all contexts
+ * @mmu_priv: device-specific MMU data.
+ * @mmu_func: device-related MMU functions.
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -1462,6 +1551,8 @@ struct hl_device_idle_busy_ts {
  *                     details.
  * @in_reset: is device in reset flow.
  * @curr_pll_profile: current PLL profile.
+ * @card_type: Various ASICs have several card types. This indicates the card
+ *             type of the current device.
  * @cs_active_cnt: number of active command submissions on this device (active
  *                 means already in H/W queues)
  * @major: habanalabs kernel driver major.
@@ -1469,6 +1560,7 @@ struct hl_device_idle_busy_ts {
  * @soft_reset_cnt: number of soft reset since the driver was loaded.
  * @hard_reset_cnt: number of hard reset since the driver was loaded.
  * @idle_busy_ts_idx: index of current entry in idle_busy_ts_arr
+ * @clk_throttling_reason: bitmask represents the current clk throttling reasons
  * @id: device minor.
  * @id_control: minor of the control device
  * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -1477,7 +1569,7 @@ struct hl_device_idle_busy_ts {
  * @late_init_done: is late init stage was done during initialization.
  * @hwmon_initialized: is H/W monitor sensors was initialized.
  * @hard_reset_pending: is there a hard reset work pending.
- * @heartbeat: is heartbeat sanity check towards ArmCP enabled.
+ * @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
  * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
  *                   otherwise.
  * @dram_supports_virtual_memory: is MMU enabled towards DRAM.
@@ -1499,6 +1591,7 @@ struct hl_device_idle_busy_ts {
  * @sync_stream_queue_idx: helper index for sync stream queues initialization.
  * @supports_coresight: is CoreSight supported.
  * @supports_soft_reset: is soft reset supported.
+ * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -1511,7 +1604,7 @@ struct hl_device {
 	struct device			*dev_ctrl;
 	struct delayed_work		work_freq;
 	struct delayed_work		work_heartbeat;
-	char				asic_name[16];
+	char				asic_name[32];
 	enum hl_asic_type		asic_type;
 	struct hl_cq			*completion_queue;
 	struct workqueue_struct		**cq_wq;
@@ -1533,10 +1626,8 @@ struct hl_device {
 	struct asic_fixed_properties	asic_prop;
 	const struct hl_asic_funcs	*asic_funcs;
 	void				*asic_specific;
-	struct gen_pool			*mmu_pgt_pool;
 	struct hl_vm			vm;
 	struct mutex			mmu_cache_lock;
-	void				*mmu_shadow_hop0;
 	struct device			*hwmon_dev;
 	enum hl_pm_mng_profile		pm_mng_profile;
 	struct hwmon_chip_info		*hl_chip_info;
@@ -1560,18 +1651,23 @@ struct hl_device {
 
 	struct hl_cs_counters		aggregated_cs_counters;
 
+	struct hl_mmu_priv		mmu_priv;
+	struct hl_mmu_funcs		mmu_func;
+
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
 	u64				max_power;
 	u64				clock_gating_mask;
 	atomic_t			in_reset;
 	enum hl_pll_frequency		curr_pll_profile;
+	enum cpucp_card_types		card_type;
 	int				cs_active_cnt;
 	u32				major;
 	u32				high_pll;
 	u32				soft_reset_cnt;
 	u32				hard_reset_cnt;
 	u32				idle_busy_ts_idx;
+	u32				clk_throttling_reason;
 	u16				id;
 	u16				id_control;
 	u16				cpu_pci_msb_addr;
@@ -1595,6 +1691,7 @@ struct hl_device {
 	u8				sync_stream_queue_idx;
 	u8				supports_coresight;
 	u8				supports_soft_reset;
+	u8				supports_cb_mapping;
 
 	/* Parameters for bring-up */
 	u8				mmu_enable;
@@ -1651,7 +1748,7 @@ struct hl_ioctl_desc {
  *
  * Return: true if the area is inside the valid range, false otherwise.
  */
-static inline bool hl_mem_area_inside_range(u64 address, u32 size,
+static inline bool hl_mem_area_inside_range(u64 address, u64 size,
 				u64 range_start_address, u64 range_end_address)
 {
 	u64 end_address = address + size;
@@ -1736,7 +1833,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
 void hl_ctx_do_release(struct kref *ref);
 void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
-struct dma_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
+struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
 void hl_ctx_mgr_init(struct hl_ctx_mgr *mgr);
 void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr);
 
@@ -1752,7 +1849,7 @@ int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
 uint32_t hl_device_utilization(struct hl_device *hdev, uint32_t period_ms);
 
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
-		struct armcp_sensor *sensors_arr);
+		struct cpucp_sensor *sensors_arr);
 
 int hl_sysfs_init(struct hl_device *hdev);
 void hl_sysfs_fini(struct hl_device *hdev);
@@ -1760,8 +1857,9 @@ void hl_sysfs_fini(struct hl_device *hdev);
 int hl_hwmon_init(struct hl_device *hdev);
 void hl_hwmon_fini(struct hl_device *hdev);
 
-int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr, u32 cb_size,
-		u64 *handle, int ctx_id, bool internal_cb);
+int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
+			struct hl_ctx *ctx, u32 cb_size, bool internal_cb,
+			bool map_cb, u64 *handle);
 int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle);
 int hl_cb_mmap(struct hl_fpriv *hpriv, struct vm_area_struct *vma);
 struct hl_cb *hl_cb_get(struct hl_device *hdev,	struct hl_cb_mgr *mgr,
@@ -1773,11 +1871,15 @@ struct hl_cb *hl_cb_kernel_create(struct hl_device *hdev, u32 cb_size,
 					bool internal_cb);
 int hl_cb_pool_init(struct hl_device *hdev);
 int hl_cb_pool_fini(struct hl_device *hdev);
+int hl_cb_va_pool_init(struct hl_ctx *ctx);
+void hl_cb_va_pool_fini(struct hl_ctx *ctx);
 
 void hl_cs_rollback_all(struct hl_device *hdev);
 struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
 		enum hl_queue_type queue_type, bool is_kernel_allocated_cb);
 void hl_sob_reset_error(struct kref *ref);
+void hl_fence_put(struct hl_fence *fence);
+void hl_fence_get(struct hl_fence *fence);
 
 void goya_set_asic_funcs(struct hl_device *hdev);
 void gaudi_set_asic_funcs(struct hl_device *hdev);
@@ -1807,6 +1909,8 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 		bool flush_pte);
 void hl_mmu_swap_out(struct hl_ctx *ctx);
 void hl_mmu_swap_in(struct hl_ctx *ctx);
+int hl_mmu_if_set_funcs(struct hl_device *hdev);
+void hl_mmu_v1_set_funcs(struct hl_device *hdev);
 
 int hl_fw_load_fw_to_device(struct hl_device *hdev, const char *fw_name,
 				void __iomem *dst);
@@ -1822,23 +1926,28 @@ void *hl_fw_cpu_accessible_dma_pool_alloc(struct hl_device *hdev, size_t size,
 void hl_fw_cpu_accessible_dma_pool_free(struct hl_device *hdev, size_t size,
 					void *vaddr);
 int hl_fw_send_heartbeat(struct hl_device *hdev);
-int hl_fw_armcp_info_get(struct hl_device *hdev);
+int hl_fw_cpucp_info_get(struct hl_device *hdev);
 int hl_fw_get_eeprom_data(struct hl_device *hdev, void *data, size_t max_size);
+int hl_fw_cpucp_pci_counters_get(struct hl_device *hdev,
+		struct hl_info_pci_counters *counters);
+int hl_fw_cpucp_total_energy_get(struct hl_device *hdev,
+			u64 *total_energy);
 int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
 			u32 msg_to_cpu_reg, u32 cpu_msg_status_reg,
 			u32 boot_err0_reg, bool skip_bmc,
 			u32 cpu_timeout, u32 boot_fit_timeout);
+int hl_fw_read_preboot_ver(struct hl_device *hdev, u32 cpu_boot_status_reg,
+				u32 boot_err0_reg, u32 timeout);
 
 int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
 			bool is_wc[3]);
 int hl_pci_iatu_write(struct hl_device *hdev, u32 addr, u32 data);
-int hl_pci_set_dram_bar_base(struct hl_device *hdev, u8 inbound_region, u8 bar,
-				u64 addr);
 int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
 		struct hl_inbound_pci_region *pci_region);
 int hl_pci_set_outbound_region(struct hl_device *hdev,
 		struct hl_outbound_pci_region *pci_region);
-int hl_pci_init(struct hl_device *hdev);
+int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
+		u32 boot_err0_reg, u32 preboot_ver_timeout);
 void hl_pci_fini(struct hl_device *hdev);
 
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr);
@@ -1858,7 +1967,7 @@ int hl_get_pwm_info(struct hl_device *hdev,
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 			long value);
 u64 hl_get_max_power(struct hl_device *hdev);
-void hl_set_max_power(struct hl_device *hdev, u64 value);
+void hl_set_max_power(struct hl_device *hdev);
 int hl_set_voltage(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value);
 int hl_set_current(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index c6b31e93fb5e..f9067d3ef437 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -11,6 +11,7 @@
 #include "habanalabs.h"
 
 #include <linux/pci.h>
+#include <linux/aer.h>
 #include <linux/module.h>
 
 #define HL_DRIVER_AUTHOR	"HabanaLabs Kernel Driver Team"
@@ -408,6 +409,8 @@ static int hl_pci_probe(struct pci_dev *pdev,
 
 	pci_set_drvdata(pdev, hdev);
 
+	pci_enable_pcie_error_reporting(pdev);
+
 	rc = hl_device_init(hdev, hl_class);
 	if (rc) {
 		dev_err(&pdev->dev, "Fatal error during habanalabs device init\n");
@@ -440,22 +443,93 @@ static void hl_pci_remove(struct pci_dev *pdev)
 		return;
 
 	hl_device_fini(hdev);
+	pci_disable_pcie_error_reporting(pdev);
 	pci_set_drvdata(pdev, NULL);
-
 	destroy_hdev(hdev);
 }
 
+/**
+ * hl_pci_err_detected - a PCI bus error detected on this device
+ *
+ * @pdev: pointer to pci device
+ * @state: PCI error type
+ *
+ * Called by the PCI subsystem whenever a non-correctable
+ * PCI bus error is detected
+ */
+static pci_ers_result_t
+hl_pci_err_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+	struct hl_device *hdev = pci_get_drvdata(pdev);
+	enum pci_ers_result result;
+
+	switch (state) {
+	case pci_channel_io_normal:
+		return PCI_ERS_RESULT_CAN_RECOVER;
+
+	case pci_channel_io_frozen:
+		dev_warn(hdev->dev, "frozen state error detected\n");
+		result = PCI_ERS_RESULT_NEED_RESET;
+		break;
+
+	case pci_channel_io_perm_failure:
+		dev_warn(hdev->dev, "failure state error detected\n");
+		result = PCI_ERS_RESULT_DISCONNECT;
+		break;
+
+	default:
+		result = PCI_ERS_RESULT_NONE;
+	}
+
+	hdev->asic_funcs->halt_engines(hdev, true);
+
+	return result;
+}
+
+/**
+ * hl_pci_err_resume - resume after a PCI slot reset
+ *
+ * @pdev: pointer to pci device
+ *
+ */
+static void hl_pci_err_resume(struct pci_dev *pdev)
+{
+	struct hl_device *hdev = pci_get_drvdata(pdev);
+
+	dev_warn(hdev->dev, "Resuming device after PCI slot reset\n");
+	hl_device_resume(hdev);
+}
+
+/**
+ * hl_pci_err_slot_reset - a PCI slot reset has just happened
+ *
+ * @pdev: pointer to pci device
+ *
+ * Determine if the driver can recover from the PCI slot reset
+ */
+static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
+{
+	return PCI_ERS_RESULT_RECOVERED;
+}
+
 static const struct dev_pm_ops hl_pm_ops = {
 	.suspend = hl_pmops_suspend,
 	.resume = hl_pmops_resume,
 };
 
+static const struct pci_error_handlers hl_pci_err_handler = {
+	.error_detected = hl_pci_err_detected,
+	.slot_reset = hl_pci_err_slot_reset,
+	.resume = hl_pci_err_resume,
+};
+
 static struct pci_driver hl_pci_driver = {
 	.name = HL_NAME,
 	.id_table = ids,
 	.probe = hl_pci_probe,
 	.remove = hl_pci_remove,
 	.driver.pm = &hl_pm_ops,
+	.err_handler = &hl_pci_err_handler,
 };
 
 /*
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 5af1c03da473..07317ea49129 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -8,6 +8,7 @@
 #include <uapi/misc/habanalabs.h>
 #include "habanalabs.h"
 
+#include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/uaccess.h>
 #include <linux/slab.h>
@@ -64,14 +65,14 @@ static int hw_ip_info(struct hl_device *hdev, struct hl_info_args *args)
 		hw_ip.dram_enabled = 1;
 	hw_ip.num_of_events = prop->num_of_events;
 
-	memcpy(hw_ip.armcp_version, prop->armcp_info.armcp_version,
+	memcpy(hw_ip.cpucp_version, prop->cpucp_info.cpucp_version,
 		min(VERSION_MAX_LEN, HL_INFO_VERSION_MAX_LEN));
 
-	memcpy(hw_ip.card_name, prop->armcp_info.card_name,
+	memcpy(hw_ip.card_name, prop->cpucp_info.card_name,
 		min(CARD_NAME_MAX_LEN, HL_INFO_CARD_NAME_MAX_LEN));
 
-	hw_ip.armcp_cpld_version = le32_to_cpu(prop->armcp_info.cpld_version);
-	hw_ip.module_id = le32_to_cpu(prop->armcp_info.card_location);
+	hw_ip.cpld_version = le32_to_cpu(prop->cpucp_info.cpld_version);
+	hw_ip.module_id = le32_to_cpu(prop->cpucp_info.card_location);
 
 	hw_ip.psoc_pci_pll_nr = prop->psoc_pci_pll_nr;
 	hw_ip.psoc_pci_pll_nf = prop->psoc_pci_pll_nf;
@@ -131,7 +132,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
 		return -EINVAL;
 
 	hw_idle.is_idle = hdev->asic_funcs->is_device_idle(hdev,
-					&hw_idle.busy_engines_mask, NULL);
+					&hw_idle.busy_engines_mask_ext, NULL);
 
 	return copy_to_user(out, &hw_idle,
 		min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
@@ -276,10 +277,45 @@ static int time_sync_info(struct hl_device *hdev, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(time_sync))) ? -EFAULT : 0;
 }
 
+static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_info_pci_counters pci_counters = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_cpucp_pci_counters_get(hdev, &pci_counters);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &pci_counters,
+		min((size_t) max_size, sizeof(pci_counters))) ? -EFAULT : 0;
+}
+
+static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_info_clk_throttle clk_throttle = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
+
+	return copy_to_user(out, &clk_throttle,
+		min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
+}
+
 static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
 	struct hl_device *hdev = hpriv->hdev;
-	struct hl_info_cs_counters cs_counters = {0};
+	struct hl_info_cs_counters cs_counters = { {0} };
 	u32 max_size = args->return_size;
 	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
 
@@ -297,6 +333,51 @@ static int cs_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(cs_counters))) ? -EFAULT : 0;
 }
 
+static int sync_manager_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_info_sync_manager sm_info = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	if (args->dcore_id >= HL_MAX_DCORES)
+		return -EINVAL;
+
+	sm_info.first_available_sync_object =
+			prop->first_available_user_sob[args->dcore_id];
+	sm_info.first_available_monitor =
+			prop->first_available_user_mon[args->dcore_id];
+
+
+	return copy_to_user(out, &sm_info, min_t(size_t, (size_t) max_size,
+			sizeof(sm_info))) ? -EFAULT : 0;
+}
+
+static int total_energy_consumption_info(struct hl_fpriv *hpriv,
+			struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	struct hl_info_energy total_energy = {0};
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_cpucp_total_energy_get(hdev,
+			&total_energy.total_energy_consumption);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &total_energy,
+		min((size_t) max_size, sizeof(total_energy))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -360,6 +441,18 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_CS_COUNTERS:
 		return cs_counters_info(hpriv, args);
 
+	case HL_INFO_PCI_COUNTERS:
+		return pci_counters_info(hpriv, args);
+
+	case HL_INFO_CLK_THROTTLE_REASON:
+		return clk_throttle_info(hpriv, args);
+
+	case HL_INFO_SYNC_MANAGER:
+		return sync_manager_info(hpriv, args);
+
+	case HL_INFO_TOTAL_ENERGY:
+		return total_energy_consumption_info(hpriv, args);
+
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
 		rc = -ENOTTY;
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 287681646071..250cf9cefc06 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -75,7 +75,7 @@ static void ext_and_hw_queue_submit_bd(struct hl_device *hdev,
 {
 	struct hl_bd *bd;
 
-	bd = (struct hl_bd *) (uintptr_t) q->kernel_address;
+	bd = q->kernel_address;
 	bd += hl_pi_2_offset(q->pi);
 	bd->ctl = cpu_to_le32(ctl);
 	bd->len = cpu_to_le32(len);
@@ -288,10 +288,10 @@ static void ext_queue_schedule_job(struct hl_cs_job *job)
 	ptr = cb->bus_address;
 
 	cq_pkt.data = cpu_to_le32(
-				((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
-					& CQ_ENTRY_SHADOW_INDEX_MASK) |
-				(1 << CQ_ENTRY_SHADOW_INDEX_VALID_SHIFT) |
-				(1 << CQ_ENTRY_READY_SHIFT));
+			((q->pi << CQ_ENTRY_SHADOW_INDEX_SHIFT)
+				& CQ_ENTRY_SHADOW_INDEX_MASK) |
+			FIELD_PREP(CQ_ENTRY_SHADOW_INDEX_VALID_MASK, 1) |
+			FIELD_PREP(CQ_ENTRY_READY_MASK, 1));
 
 	/*
 	 * No need to protect pi_offset because scheduling to the
@@ -335,8 +335,7 @@ static void int_queue_schedule_job(struct hl_cs_job *job)
 	bd.len = cpu_to_le32(job->job_cb_size);
 	bd.ptr = cpu_to_le64((u64) (uintptr_t) job->user_cb);
 
-	pi = (__le64 *) (uintptr_t) (q->kernel_address +
-		((q->pi & (q->int_queue_len - 1)) * sizeof(bd)));
+	pi = q->kernel_address + (q->pi & (q->int_queue_len - 1)) * sizeof(bd);
 
 	q->pi++;
 	q->pi &= ((q->int_queue_len << 1) - 1);
@@ -474,7 +473,7 @@ static void init_signal_wait_cs(struct hl_cs *cs)
 		 * wait CS was submitted.
 		 */
 		mb();
-		dma_fence_put(cs->signal_fence);
+		hl_fence_put(cs->signal_fence);
 		cs->signal_fence = NULL;
 	}
 }
@@ -630,7 +629,7 @@ static int ext_and_cpu_queue_init(struct hl_device *hdev, struct hl_hw_queue *q,
 	if (!p)
 		return -ENOMEM;
 
-	q->kernel_address = (u64) (uintptr_t) p;
+	q->kernel_address = p;
 
 	q->shadow_queue = kmalloc_array(HL_QUEUE_LENGTH,
 					sizeof(*q->shadow_queue),
@@ -653,11 +652,11 @@ free_queue:
 	if (is_cpu_queue)
 		hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
 					HL_QUEUE_SIZE_IN_BYTES,
-					(void *) (uintptr_t) q->kernel_address);
+					q->kernel_address);
 	else
 		hdev->asic_funcs->asic_dma_free_coherent(hdev,
 					HL_QUEUE_SIZE_IN_BYTES,
-					(void *) (uintptr_t) q->kernel_address,
+					q->kernel_address,
 					q->bus_address);
 
 	return rc;
@@ -676,7 +675,7 @@ static int int_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 		return -EFAULT;
 	}
 
-	q->kernel_address = (u64) (uintptr_t) p;
+	q->kernel_address = p;
 	q->pi = 0;
 	atomic_set(&q->ci, 0);
 
@@ -704,7 +703,7 @@ static int hw_queue_init(struct hl_device *hdev, struct hl_hw_queue *q)
 	if (!p)
 		return -ENOMEM;
 
-	q->kernel_address = (u64) (uintptr_t) p;
+	q->kernel_address = p;
 
 	/* Make sure read/write pointers are initialized to start of queue */
 	atomic_set(&q->ci, 0);
@@ -839,11 +838,11 @@ static void queue_fini(struct hl_device *hdev, struct hl_hw_queue *q)
 	if (q->queue_type == QUEUE_TYPE_CPU)
 		hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
 					HL_QUEUE_SIZE_IN_BYTES,
-					(void *) (uintptr_t) q->kernel_address);
+					q->kernel_address);
 	else
 		hdev->asic_funcs->asic_dma_free_coherent(hdev,
 					HL_QUEUE_SIZE_IN_BYTES,
-					(void *) (uintptr_t) q->kernel_address,
+					q->kernel_address,
 					q->bus_address);
 }
 
diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c
index b997336fa75f..2ac29cb2fe61 100644
--- a/drivers/misc/habanalabs/common/hwmon.c
+++ b/drivers/misc/habanalabs/common/hwmon.c
@@ -13,7 +13,7 @@
 #define HWMON_NR_SENSOR_TYPES		(hwmon_pwm + 1)
 
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
-				struct armcp_sensor *sensors_arr)
+				struct cpucp_sensor *sensors_arr)
 {
 	u32 counts[HWMON_NR_SENSOR_TYPES] = {0};
 	u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL};
@@ -24,7 +24,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 	enum hwmon_sensor_types type;
 	int rc, i, j;
 
-	for (i = 0 ; i < ARMCP_MAX_SENSORS ; i++) {
+	for (i = 0 ; i < CPUCP_MAX_SENSORS ; i++) {
 		type = le32_to_cpu(sensors_arr[i].type);
 
 		if ((type == 0) && (sensors_arr[i].flags == 0))
@@ -311,13 +311,13 @@ static const struct hwmon_ops hl_hwmon_ops = {
 int hl_get_temperature(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 
@@ -337,13 +337,13 @@ int hl_get_temperature(struct hl_device *hdev,
 int hl_set_temperature(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_TEMPERATURE_SET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_TEMPERATURE_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 	pkt.value = __cpu_to_le64(value);
@@ -362,13 +362,13 @@ int hl_set_temperature(struct hl_device *hdev,
 int hl_get_voltage(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 
@@ -388,13 +388,13 @@ int hl_get_voltage(struct hl_device *hdev,
 int hl_get_current(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 
@@ -414,13 +414,13 @@ int hl_get_current(struct hl_device *hdev,
 int hl_get_fan_speed(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_FAN_SPEED_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_FAN_SPEED_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 
@@ -440,13 +440,13 @@ int hl_get_fan_speed(struct hl_device *hdev,
 int hl_get_pwm_info(struct hl_device *hdev,
 			int sensor_index, u32 attr, long *value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 
@@ -466,13 +466,13 @@ int hl_get_pwm_info(struct hl_device *hdev,
 void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 			long value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_PWM_SET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_PWM_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 	pkt.value = cpu_to_le64(value);
@@ -489,13 +489,13 @@ void hl_set_pwm_info(struct hl_device *hdev, int sensor_index, u32 attr,
 int hl_set_voltage(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_VOLTAGE_SET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_VOLTAGE_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 	pkt.value = __cpu_to_le64(value);
@@ -514,13 +514,13 @@ int hl_set_voltage(struct hl_device *hdev,
 int hl_set_current(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_CURRENT_SET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_CURRENT_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 	pkt.value = __cpu_to_le64(value);
@@ -549,7 +549,7 @@ int hl_hwmon_init(struct hl_device *hdev)
 		hdev->hl_chip_info->ops = &hl_hwmon_ops;
 
 		hdev->hwmon_dev = hwmon_device_register_with_info(dev,
-					prop->armcp_info.card_name, hdev,
+					prop->cpucp_info.card_name, hdev,
 					hdev->hl_chip_info, NULL);
 		if (IS_ERR(hdev->hwmon_dev)) {
 			rc = PTR_ERR(hdev->hwmon_dev);
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index c8db717023f5..de53fb5f978a 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -11,7 +11,7 @@
 
 /**
  * struct hl_eqe_work - This structure is used to schedule work of EQ
- *                      entry and armcp_reset event
+ *                      entry and cpucp_reset event
  *
  * @eq_work:          workqueue object to run when EQ entry is received
  * @hdev:             pointer to device structure
@@ -90,7 +90,7 @@ irqreturn_t hl_irq_handler_cq(int irq, void *arg)
 		return IRQ_HANDLED;
 	}
 
-	cq_base = (struct hl_cq_entry *) (uintptr_t) cq->kernel_address;
+	cq_base = cq->kernel_address;
 
 	while (1) {
 		bool entry_ready = ((le32_to_cpu(cq_base[cq->ci].data) &
@@ -152,7 +152,7 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
 	struct hl_eq_entry *eq_base;
 	struct hl_eqe_work *handle_eqe_work;
 
-	eq_base = (struct hl_eq_entry *) (uintptr_t) eq->kernel_address;
+	eq_base = eq->kernel_address;
 
 	while (1) {
 		bool entry_ready =
@@ -221,7 +221,7 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
 		return -ENOMEM;
 
 	q->hdev = hdev;
-	q->kernel_address = (u64) (uintptr_t) p;
+	q->kernel_address = p;
 	q->hw_queue_id = hw_queue_id;
 	q->ci = 0;
 	q->pi = 0;
@@ -242,7 +242,8 @@ int hl_cq_init(struct hl_device *hdev, struct hl_cq *q, u32 hw_queue_id)
 void hl_cq_fini(struct hl_device *hdev, struct hl_cq *q)
 {
 	hdev->asic_funcs->asic_dma_free_coherent(hdev, HL_CQ_SIZE_IN_BYTES,
-			(void *) (uintptr_t) q->kernel_address, q->bus_address);
+						 q->kernel_address,
+						 q->bus_address);
 }
 
 void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q)
@@ -259,7 +260,7 @@ void hl_cq_reset(struct hl_device *hdev, struct hl_cq *q)
 	 * when the device is operational again
 	 */
 
-	memset((void *) (uintptr_t) q->kernel_address, 0, HL_CQ_SIZE_IN_BYTES);
+	memset(q->kernel_address, 0, HL_CQ_SIZE_IN_BYTES);
 }
 
 /**
@@ -282,7 +283,7 @@ int hl_eq_init(struct hl_device *hdev, struct hl_eq *q)
 		return -ENOMEM;
 
 	q->hdev = hdev;
-	q->kernel_address = (u64) (uintptr_t) p;
+	q->kernel_address = p;
 	q->ci = 0;
 
 	return 0;
@@ -302,7 +303,7 @@ void hl_eq_fini(struct hl_device *hdev, struct hl_eq *q)
 
 	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
 					HL_EQ_SIZE_IN_BYTES,
-					(void *) (uintptr_t) q->kernel_address);
+					q->kernel_address);
 }
 
 void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q)
@@ -316,5 +317,5 @@ void hl_eq_reset(struct hl_device *hdev, struct hl_eq *q)
 	 * when the device is operational again
 	 */
 
-	memset((void *) (uintptr_t) q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES);
+	memset(q->kernel_address, 0, HL_EQ_SIZE_IN_BYTES);
 }
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index dce9273e557a..84227819e4d1 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -66,14 +66,19 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 	num_pgs = (args->alloc.mem_size + (page_size - 1)) >> page_shift;
 	total_size = num_pgs << page_shift;
 
+	if (!total_size) {
+		dev_err(hdev->dev, "Cannot allocate 0 bytes\n");
+		return -EINVAL;
+	}
+
 	contiguous = args->flags & HL_MEM_CONTIGUOUS;
 
 	if (contiguous) {
 		paddr = (u64) gen_pool_alloc(vm->dram_pg_pool, total_size);
 		if (!paddr) {
 			dev_err(hdev->dev,
-				"failed to allocate %llu huge contiguous pages\n",
-				num_pgs);
+				"failed to allocate %llu contiguous pages with total size of %llu\n",
+				num_pgs, total_size);
 			return -ENOMEM;
 		}
 	}
@@ -93,7 +98,7 @@ static int alloc_device_memory(struct hl_ctx *ctx, struct hl_mem_in *args,
 	phys_pg_pack->contiguous = contiguous;
 
 	phys_pg_pack->pages = kvmalloc_array(num_pgs, sizeof(u64), GFP_KERNEL);
-	if (!phys_pg_pack->pages) {
+	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
 		rc = -ENOMEM;
 		goto pages_arr_err;
 	}
@@ -500,41 +505,32 @@ static inline int add_va_block(struct hl_device *hdev,
 }
 
 /*
- * get_va_block - get a virtual block with the requested size
- *
- * @hdev            : pointer to the habanalabs device structure
- * @va_range        : pointer to the virtual addresses range
- * @size            : requested block size
- * @hint_addr       : hint for request address by the user
- * @is_userptr      : is host or DRAM memory
+ * get_va_block() - get a virtual block for the given size and alignment.
+ * @hdev: pointer to the habanalabs device structure.
+ * @va_range: pointer to the virtual addresses range.
+ * @size: requested block size.
+ * @hint_addr: hint for requested address by the user.
+ * @va_block_align: required alignment of the virtual block start address.
  *
  * This function does the following:
  * - Iterate on the virtual block list to find a suitable virtual block for the
- *   requested size
- * - Reserve the requested block and update the list
- * - Return the start address of the virtual block
+ *   given size and alignment.
+ * - Reserve the requested block and update the list.
+ * - Return the start address of the virtual block.
  */
-static u64 get_va_block(struct hl_device *hdev,
-			struct hl_va_range *va_range, u64 size, u64 hint_addr,
-			bool is_userptr)
+static u64 get_va_block(struct hl_device *hdev, struct hl_va_range *va_range,
+			u64 size, u64 hint_addr, u32 va_block_align)
 {
 	struct hl_vm_va_block *va_block, *new_va_block = NULL;
-	u64 valid_start, valid_size, prev_start, prev_end, page_mask,
+	u64 valid_start, valid_size, prev_start, prev_end, align_mask,
 		res_valid_start = 0, res_valid_size = 0;
-	u32 page_size;
 	bool add_prev = false;
 
-	if (is_userptr)
-		/*
-		 * We cannot know if the user allocated memory with huge pages
-		 * or not, hence we continue with the biggest possible
-		 * granularity.
-		 */
-		page_size = hdev->asic_prop.pmmu_huge.page_size;
-	else
-		page_size = hdev->asic_prop.dmmu.page_size;
+	align_mask = ~((u64)va_block_align - 1);
 
-	page_mask = ~((u64)page_size - 1);
+	/* check if hint_addr is aligned */
+	if (hint_addr & (va_block_align - 1))
+		hint_addr = 0;
 
 	mutex_lock(&va_range->lock);
 
@@ -544,9 +540,9 @@ static u64 get_va_block(struct hl_device *hdev,
 		/* calc the first possible aligned addr */
 		valid_start = va_block->start;
 
-		if (valid_start & (page_size - 1)) {
-			valid_start &= page_mask;
-			valid_start += page_size;
+		if (valid_start & (va_block_align - 1)) {
+			valid_start &= align_mask;
+			valid_start += va_block_align;
 			if (valid_start > va_block->end)
 				continue;
 		}
@@ -683,7 +679,7 @@ static int init_phys_pg_pack_from_userptr(struct hl_ctx *ctx,
 
 	phys_pg_pack->pages = kvmalloc_array(total_npages, sizeof(u64),
 						GFP_KERNEL);
-	if (!phys_pg_pack->pages) {
+	if (ZERO_OR_NULL_PTR(phys_pg_pack->pages)) {
 		rc = -ENOMEM;
 		goto page_pack_arr_mem_err;
 	}
@@ -858,7 +854,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	struct hl_va_range *va_range;
 	enum vm_type_t *vm_type;
 	u64 ret_vaddr, hint_addr;
-	u32 handle = 0;
+	u32 handle = 0, va_block_align;
 	int rc;
 	bool is_userptr = args->flags & HL_MEM_USERPTR;
 
@@ -868,6 +864,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	if (is_userptr) {
 		u64 addr = args->map_host.host_virt_addr,
 			size = args->map_host.mem_size;
+		u32 page_size = hdev->asic_prop.pmmu.page_size,
+			huge_page_size = hdev->asic_prop.pmmu_huge.page_size;
 
 		rc = dma_map_host_va(hdev, addr, size, &userptr);
 		if (rc) {
@@ -887,6 +885,27 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		vm_type = (enum vm_type_t *) userptr;
 		hint_addr = args->map_host.hint_addr;
 		handle = phys_pg_pack->handle;
+
+		/* get required alignment */
+		if (phys_pg_pack->page_size == page_size) {
+			va_range = ctx->host_va_range;
+
+			/*
+			 * huge page alignment may be needed in case of regular
+			 * page mapping, depending on the host VA alignment
+			 */
+			if (addr & (huge_page_size - 1))
+				va_block_align = page_size;
+			else
+				va_block_align = huge_page_size;
+		} else {
+			/*
+			 * huge page alignment is needed in case of huge page
+			 * mapping
+			 */
+			va_range = ctx->host_huge_va_range;
+			va_block_align = huge_page_size;
+		}
 	} else {
 		handle = lower_32_bits(args->map_device.handle);
 
@@ -907,6 +926,10 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		vm_type = (enum vm_type_t *) phys_pg_pack;
 
 		hint_addr = args->map_device.hint_addr;
+
+		/* DRAM VA alignment is the same as the DRAM page size */
+		va_range = ctx->dram_va_range;
+		va_block_align = hdev->asic_prop.dmmu.page_size;
 	}
 
 	/*
@@ -928,16 +951,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		goto hnode_err;
 	}
 
-	if (is_userptr)
-		if (phys_pg_pack->page_size == hdev->asic_prop.pmmu.page_size)
-			va_range = ctx->host_va_range;
-		else
-			va_range = ctx->host_huge_va_range;
-	else
-		va_range = ctx->dram_va_range;
-
 	ret_vaddr = get_va_block(hdev, va_range, phys_pg_pack->total_size,
-					hint_addr, is_userptr);
+					hint_addr, va_block_align);
 	if (!ret_vaddr) {
 		dev_err(hdev->dev, "no available va block for handle %u\n",
 				handle);
diff --git a/drivers/misc/habanalabs/common/mmu.c b/drivers/misc/habanalabs/common/mmu.c
index edcc11d5eaf1..b5058798aeb9 100644
--- a/drivers/misc/habanalabs/common/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu.c
@@ -1,258 +1,13 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2020 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
-#include "habanalabs.h"
-#include "../include/hw_ip/mmu/mmu_general.h"
-
-#include <linux/genalloc.h>
 #include <linux/slab.h>
 
-static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
-
-static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
-{
-	struct pgt_info *pgt_info = NULL;
-
-	hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
-				(unsigned long) hop_addr)
-		if (hop_addr == pgt_info->shadow_addr)
-			break;
-
-	return pgt_info;
-}
-
-static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
-{
-	struct hl_device *hdev = ctx->hdev;
-
-	gen_pool_free(hdev->mmu_pgt_pool, pgt_info->phys_addr,
-			hdev->asic_prop.mmu_hop_table_size);
-	hash_del(&pgt_info->node);
-	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
-	kfree(pgt_info);
-}
-
-static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
-{
-	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
-
-	_free_hop(ctx, pgt_info);
-}
-
-static u64 alloc_hop(struct hl_ctx *ctx)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct pgt_info *pgt_info;
-	u64 phys_addr, shadow_addr;
-
-	pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
-	if (!pgt_info)
-		return ULLONG_MAX;
-
-	phys_addr = (u64) gen_pool_alloc(hdev->mmu_pgt_pool,
-					prop->mmu_hop_table_size);
-	if (!phys_addr) {
-		dev_err(hdev->dev, "failed to allocate page\n");
-		goto pool_add_err;
-	}
-
-	shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
-						GFP_KERNEL);
-	if (!shadow_addr)
-		goto shadow_err;
-
-	pgt_info->phys_addr = phys_addr;
-	pgt_info->shadow_addr = shadow_addr;
-	pgt_info->ctx = ctx;
-	pgt_info->num_of_ptes = 0;
-	hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
-
-	return shadow_addr;
-
-shadow_err:
-	gen_pool_free(hdev->mmu_pgt_pool, phys_addr, prop->mmu_hop_table_size);
-pool_add_err:
-	kfree(pgt_info);
-
-	return ULLONG_MAX;
-}
-
-static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
-{
-	return ctx->hdev->asic_prop.mmu_pgt_addr +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
-}
-
-static inline u64 get_hop0_addr(struct hl_ctx *ctx)
-{
-	return (u64) (uintptr_t) ctx->hdev->mmu_shadow_hop0 +
-			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
-}
-
-static inline void flush(struct hl_ctx *ctx)
-{
-	/* flush all writes from all cores to reach PCI */
-	mb();
-	ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
-}
-
-/* transform the value to physical address when writing to H/W */
-static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
-{
-	/*
-	 * The value to write is actually the address of the next shadow hop +
-	 * flags at the 12 LSBs.
-	 * Hence in order to get the value to write to the physical PTE, we
-	 * clear the 12 LSBs and translate the shadow hop to its associated
-	 * physical hop, and add back the original 12 LSBs.
-	 */
-	u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
-				(val & FLAGS_MASK);
-
-	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
-					get_phys_addr(ctx, shadow_pte_addr),
-					phys_val);
-
-	*(u64 *) (uintptr_t) shadow_pte_addr = val;
-}
-
-/* do not transform the value to physical address when writing to H/W */
-static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
-					u64 val)
-{
-	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
-					get_phys_addr(ctx, shadow_pte_addr),
-					val);
-	*(u64 *) (uintptr_t) shadow_pte_addr = val;
-}
-
-/* clear the last and present bits */
-static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
-{
-	/* no need to transform the value to physical address */
-	write_final_pte(ctx, pte_addr, 0);
-}
-
-static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
-{
-	get_pgt_info(ctx, hop_addr)->num_of_ptes++;
-}
-
-/*
- * put_pte - decrement the num of ptes and free the hop if possible
- *
- * @ctx: pointer to the context structure
- * @hop_addr: addr of the hop
- *
- * This function returns the number of ptes left on this hop. If the number is
- * 0, it means the pte was freed.
- */
-static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
-{
-	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
-	int num_of_ptes_left;
-
-	pgt_info->num_of_ptes--;
-
-	/*
-	 * Need to save the number of ptes left because free_hop might free
-	 * the pgt_info
-	 */
-	num_of_ptes_left = pgt_info->num_of_ptes;
-	if (!num_of_ptes_left)
-		_free_hop(ctx, pgt_info);
-
-	return num_of_ptes_left;
-}
-
-static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
-					u64 virt_addr, u64 mask, u64 shift)
-{
-	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
-			((virt_addr & mask) >> shift);
-}
-
-static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
-					mmu_prop->hop0_shift);
-}
-
-static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
-					mmu_prop->hop1_shift);
-}
-
-static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
-					mmu_prop->hop2_shift);
-}
-
-static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
-					mmu_prop->hop3_shift);
-}
-
-static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
-					struct hl_mmu_properties *mmu_prop,
-					u64 hop_addr, u64 vaddr)
-{
-	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
-					mmu_prop->hop4_shift);
-}
-
-static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
-{
-	if (curr_pte & PAGE_PRESENT_MASK)
-		return curr_pte & HOP_PHYS_ADDR_MASK;
-	else
-		return ULLONG_MAX;
-}
-
-static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
-						bool *is_new_hop)
-{
-	u64 hop_addr = get_next_hop_addr(ctx, curr_pte);
-
-	if (hop_addr == ULLONG_MAX) {
-		hop_addr = alloc_hop(ctx);
-		*is_new_hop = (hop_addr != ULLONG_MAX);
-	}
-
-	return hop_addr;
-}
-
-/* translates shadow address inside hop to a physical address */
-static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
-{
-	u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
-	u64 shadow_hop_addr = shadow_addr & ~page_mask;
-	u64 pte_offset = shadow_addr & page_mask;
-	u64 phys_hop_addr;
-
-	if (shadow_hop_addr != get_hop0_addr(ctx))
-		phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
-	else
-		phys_hop_addr = get_phys_hop0_addr(ctx);
-
-	return phys_hop_addr + pte_offset;
-}
+#include "habanalabs.h"
 
 static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
 {
@@ -263,155 +18,6 @@ static bool is_dram_va(struct hl_device *hdev, u64 virt_addr)
 					prop->dmmu.end_addr);
 }
 
-static int dram_default_mapping_init(struct hl_ctx *ctx)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
-		hop2_pte_addr, hop3_pte_addr, pte_val;
-	int rc, i, j, hop3_allocated = 0;
-
-	if ((!hdev->dram_supports_virtual_memory) ||
-			(!hdev->dram_default_page_mapping) ||
-			(ctx->asid == HL_KERNEL_ASID_ID))
-		return 0;
-
-	num_of_hop3 = prop->dram_size_for_default_page_mapping;
-	do_div(num_of_hop3, prop->dram_page_size);
-	do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
-
-	/* add hop1 and hop2 */
-	total_hops = num_of_hop3 + 2;
-
-	ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops,  GFP_KERNEL);
-	if (!ctx->dram_default_hops)
-		return -ENOMEM;
-
-	hop0_addr = get_hop0_addr(ctx);
-
-	hop1_addr = alloc_hop(ctx);
-	if (hop1_addr == ULLONG_MAX) {
-		dev_err(hdev->dev, "failed to alloc hop 1\n");
-		rc = -ENOMEM;
-		goto hop1_err;
-	}
-
-	ctx->dram_default_hops[total_hops - 1] = hop1_addr;
-
-	hop2_addr = alloc_hop(ctx);
-	if (hop2_addr == ULLONG_MAX) {
-		dev_err(hdev->dev, "failed to alloc hop 2\n");
-		rc = -ENOMEM;
-		goto hop2_err;
-	}
-
-	ctx->dram_default_hops[total_hops - 2] = hop2_addr;
-
-	for (i = 0 ; i < num_of_hop3 ; i++) {
-		ctx->dram_default_hops[i] = alloc_hop(ctx);
-		if (ctx->dram_default_hops[i] == ULLONG_MAX) {
-			dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
-			rc = -ENOMEM;
-			goto hop3_err;
-		}
-		hop3_allocated++;
-	}
-
-	/* need only pte 0 in hops 0 and 1 */
-	pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-	write_pte(ctx, hop0_addr, pte_val);
-
-	pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-	write_pte(ctx, hop1_addr, pte_val);
-	get_pte(ctx, hop1_addr);
-
-	hop2_pte_addr = hop2_addr;
-	for (i = 0 ; i < num_of_hop3 ; i++) {
-		pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
-				PAGE_PRESENT_MASK;
-		write_pte(ctx, hop2_pte_addr, pte_val);
-		get_pte(ctx, hop2_addr);
-		hop2_pte_addr += HL_PTE_SIZE;
-	}
-
-	pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
-			LAST_MASK | PAGE_PRESENT_MASK;
-
-	for (i = 0 ; i < num_of_hop3 ; i++) {
-		hop3_pte_addr = ctx->dram_default_hops[i];
-		for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
-			write_final_pte(ctx, hop3_pte_addr, pte_val);
-			get_pte(ctx, ctx->dram_default_hops[i]);
-			hop3_pte_addr += HL_PTE_SIZE;
-		}
-	}
-
-	flush(ctx);
-
-	return 0;
-
-hop3_err:
-	for (i = 0 ; i < hop3_allocated ; i++)
-		free_hop(ctx, ctx->dram_default_hops[i]);
-
-	free_hop(ctx, hop2_addr);
-hop2_err:
-	free_hop(ctx, hop1_addr);
-hop1_err:
-	kfree(ctx->dram_default_hops);
-
-	return rc;
-}
-
-static void dram_default_mapping_fini(struct hl_ctx *ctx)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
-		hop2_pte_addr, hop3_pte_addr;
-	int i, j;
-
-	if ((!hdev->dram_supports_virtual_memory) ||
-			(!hdev->dram_default_page_mapping) ||
-			(ctx->asid == HL_KERNEL_ASID_ID))
-		return;
-
-	num_of_hop3 = prop->dram_size_for_default_page_mapping;
-	do_div(num_of_hop3, prop->dram_page_size);
-	do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
-
-	hop0_addr = get_hop0_addr(ctx);
-	/* add hop1 and hop2 */
-	total_hops = num_of_hop3 + 2;
-	hop1_addr = ctx->dram_default_hops[total_hops - 1];
-	hop2_addr = ctx->dram_default_hops[total_hops - 2];
-
-	for (i = 0 ; i < num_of_hop3 ; i++) {
-		hop3_pte_addr = ctx->dram_default_hops[i];
-		for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
-			clear_pte(ctx, hop3_pte_addr);
-			put_pte(ctx, ctx->dram_default_hops[i]);
-			hop3_pte_addr += HL_PTE_SIZE;
-		}
-	}
-
-	hop2_pte_addr = hop2_addr;
-	hop2_pte_addr = hop2_addr;
-	for (i = 0 ; i < num_of_hop3 ; i++) {
-		clear_pte(ctx, hop2_pte_addr);
-		put_pte(ctx, hop2_addr);
-		hop2_pte_addr += HL_PTE_SIZE;
-	}
-
-	clear_pte(ctx, hop1_addr);
-	put_pte(ctx, hop1_addr);
-	clear_pte(ctx, hop0_addr);
-
-	kfree(ctx->dram_default_hops);
-
-	flush(ctx);
-}
-
 /**
  * hl_mmu_init() - initialize the MMU module.
  * @hdev: habanalabs device structure.
@@ -424,45 +30,10 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
  */
 int hl_mmu_init(struct hl_device *hdev)
 {
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	int rc;
-
-	if (!hdev->mmu_enable)
-		return 0;
-
-	hdev->mmu_pgt_pool =
-			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
-
-	if (!hdev->mmu_pgt_pool) {
-		dev_err(hdev->dev, "Failed to create page gen pool\n");
-		return -ENOMEM;
-	}
-
-	rc = gen_pool_add(hdev->mmu_pgt_pool, prop->mmu_pgt_addr +
-			prop->mmu_hop0_tables_total_size,
-			prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
-			-1);
-	if (rc) {
-		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
-		goto err_pool_add;
-	}
-
-	hdev->mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
-					prop->mmu_hop_table_size,
-					GFP_KERNEL | __GFP_ZERO);
-	if (!hdev->mmu_shadow_hop0) {
-		rc = -ENOMEM;
-		goto err_pool_add;
-	}
-
-	/* MMU H/W init will be done in device hw_init() */
+	if (hdev->mmu_enable)
+		return hdev->mmu_func.init(hdev);
 
 	return 0;
-
-err_pool_add:
-	gen_pool_destroy(hdev->mmu_pgt_pool);
-
-	return rc;
 }
 
 /**
@@ -477,13 +48,8 @@ err_pool_add:
  */
 void hl_mmu_fini(struct hl_device *hdev)
 {
-	if (!hdev->mmu_enable)
-		return;
-
-	/* MMU H/W fini was already done in device hw_fini() */
-
-	kvfree(hdev->mmu_shadow_hop0);
-	gen_pool_destroy(hdev->mmu_pgt_pool);
+	if (hdev->mmu_enable)
+		hdev->mmu_func.fini(hdev);
 }
 
 /**
@@ -498,13 +64,10 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
 
-	if (!hdev->mmu_enable)
-		return 0;
+	if (hdev->mmu_enable)
+		return hdev->mmu_func.ctx_init(ctx);
 
-	mutex_init(&ctx->mmu_lock);
-	hash_init(ctx->mmu_shadow_hash);
-
-	return dram_default_mapping_init(ctx);
+	return 0;
 }
 
 /*
@@ -520,160 +83,9 @@ int hl_mmu_ctx_init(struct hl_ctx *ctx)
 void hl_mmu_ctx_fini(struct hl_ctx *ctx)
 {
 	struct hl_device *hdev = ctx->hdev;
-	struct pgt_info *pgt_info;
-	struct hlist_node *tmp;
-	int i;
-
-	if (!hdev->mmu_enable)
-		return;
-
-	dram_default_mapping_fini(ctx);
-
-	if (!hash_empty(ctx->mmu_shadow_hash))
-		dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
-			ctx->asid);
-
-	hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
-		dev_err_ratelimited(hdev->dev,
-			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
-			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
-		_free_hop(ctx, pgt_info);
-	}
-
-	mutex_destroy(&ctx->mmu_lock);
-}
-
-static int _hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, bool is_dram_addr)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct hl_mmu_properties *mmu_prop;
-	u64 hop0_addr = 0, hop0_pte_addr = 0,
-		hop1_addr = 0, hop1_pte_addr = 0,
-		hop2_addr = 0, hop2_pte_addr = 0,
-		hop3_addr = 0, hop3_pte_addr = 0,
-		hop4_addr = 0, hop4_pte_addr = 0,
-		curr_pte;
-	bool is_huge, clear_hop3 = true;
-
-	/* shifts and masks are the same in PMMU and HPMMU, use one of them */
-	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
-
-	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
-
-	hop1_addr = get_next_hop_addr(ctx, curr_pte);
-
-	if (hop1_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
-
-	hop2_addr = get_next_hop_addr(ctx, curr_pte);
-
-	if (hop2_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
-
-	hop3_addr = get_next_hop_addr(ctx, curr_pte);
-
-	if (hop3_addr == ULLONG_MAX)
-		goto not_mapped;
-
-	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
-
-	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
-
-	is_huge = curr_pte & LAST_MASK;
-
-	if (is_dram_addr && !is_huge) {
-		dev_err(hdev->dev,
-				"DRAM unmapping should use huge pages only\n");
-		return -EFAULT;
-	}
-
-	if (!is_huge) {
-		hop4_addr = get_next_hop_addr(ctx, curr_pte);
-
-		if (hop4_addr == ULLONG_MAX)
-			goto not_mapped;
-
-		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
-							virt_addr);
-
-		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
-
-		clear_hop3 = false;
-	}
-
-	if (hdev->dram_default_page_mapping && is_dram_addr) {
-		u64 default_pte = (prop->mmu_dram_default_page_addr &
-				HOP_PHYS_ADDR_MASK) | LAST_MASK |
-					PAGE_PRESENT_MASK;
-		if (curr_pte == default_pte) {
-			dev_err(hdev->dev,
-				"DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n",
-					virt_addr);
-			goto not_mapped;
-		}
-
-		if (!(curr_pte & PAGE_PRESENT_MASK)) {
-			dev_err(hdev->dev,
-				"DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n",
-					virt_addr);
-			goto not_mapped;
-		}
-
-		write_final_pte(ctx, hop3_pte_addr, default_pte);
-		put_pte(ctx, hop3_addr);
-	} else {
-		if (!(curr_pte & PAGE_PRESENT_MASK))
-			goto not_mapped;
-
-		if (hop4_addr)
-			clear_pte(ctx, hop4_pte_addr);
-		else
-			clear_pte(ctx, hop3_pte_addr);
-
-		if (hop4_addr && !put_pte(ctx, hop4_addr))
-			clear_hop3 = true;
-
-		if (!clear_hop3)
-			goto mapped;
-
-		clear_pte(ctx, hop3_pte_addr);
 
-		if (put_pte(ctx, hop3_addr))
-			goto mapped;
-
-		clear_pte(ctx, hop2_pte_addr);
-
-		if (put_pte(ctx, hop2_addr))
-			goto mapped;
-
-		clear_pte(ctx, hop1_pte_addr);
-
-		if (put_pte(ctx, hop1_addr))
-			goto mapped;
-
-		clear_pte(ctx, hop0_pte_addr);
-	}
-
-mapped:
-	return 0;
-
-not_mapped:
-	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
-		virt_addr);
-
-	return -EINVAL;
+	if (hdev->mmu_enable)
+		hdev->mmu_func.ctx_fini(ctx);
 }
 
 /*
@@ -738,7 +150,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	real_virt_addr = virt_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = _hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr);
+		rc = hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr);
 		if (rc)
 			break;
 
@@ -746,172 +158,7 @@ int hl_mmu_unmap(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 	}
 
 	if (flush_pte)
-		flush(ctx);
-
-	return rc;
-}
-
-static int _hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
-			u32 page_size, bool is_dram_addr)
-{
-	struct hl_device *hdev = ctx->hdev;
-	struct asic_fixed_properties *prop = &hdev->asic_prop;
-	struct hl_mmu_properties *mmu_prop;
-	u64 hop0_addr = 0, hop0_pte_addr = 0,
-		hop1_addr = 0, hop1_pte_addr = 0,
-		hop2_addr = 0, hop2_pte_addr = 0,
-		hop3_addr = 0, hop3_pte_addr = 0,
-		hop4_addr = 0, hop4_pte_addr = 0,
-		curr_pte = 0;
-	bool hop1_new = false, hop2_new = false, hop3_new = false,
-		hop4_new = false, is_huge;
-	int rc = -ENOMEM;
-
-	/*
-	 * This mapping function can map a page or a huge page. For huge page
-	 * there are only 3 hops rather than 4. Currently the DRAM allocation
-	 * uses huge pages only but user memory could have been allocated with
-	 * one of the two page sizes. Since this is a common code for all the
-	 * three cases, we need this hugs page check.
-	 */
-	if (is_dram_addr) {
-		mmu_prop = &prop->dmmu;
-		is_huge = true;
-	} else if (page_size == prop->pmmu_huge.page_size) {
-		mmu_prop = &prop->pmmu_huge;
-		is_huge = true;
-	} else {
-		mmu_prop = &prop->pmmu;
-		is_huge = false;
-	}
-
-	hop0_addr = get_hop0_addr(ctx);
-	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
-
-	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
-	if (hop1_addr == ULLONG_MAX)
-		goto err;
-
-	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
-
-	hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
-	if (hop2_addr == ULLONG_MAX)
-		goto err;
-
-	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
-
-	hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
-	if (hop3_addr == ULLONG_MAX)
-		goto err;
-
-	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
-	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
-
-	if (!is_huge) {
-		hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
-		if (hop4_addr == ULLONG_MAX)
-			goto err;
-
-		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
-							virt_addr);
-		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
-	}
-
-	if (hdev->dram_default_page_mapping && is_dram_addr) {
-		u64 default_pte = (prop->mmu_dram_default_page_addr &
-					HOP_PHYS_ADDR_MASK) | LAST_MASK |
-						PAGE_PRESENT_MASK;
-
-		if (curr_pte != default_pte) {
-			dev_err(hdev->dev,
-				"DRAM: mapping already exists for virt_addr 0x%llx\n",
-					virt_addr);
-			rc = -EINVAL;
-			goto err;
-		}
-
-		if (hop1_new || hop2_new || hop3_new || hop4_new) {
-			dev_err(hdev->dev,
-				"DRAM mapping should not allocate more hops\n");
-			rc = -EFAULT;
-			goto err;
-		}
-	} else if (curr_pte & PAGE_PRESENT_MASK) {
-		dev_err(hdev->dev,
-			"mapping already exists for virt_addr 0x%llx\n",
-				virt_addr);
-
-		dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
-		dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
-		dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
-		dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
-			*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
-
-		if (!is_huge)
-			dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
-				*(u64 *) (uintptr_t) hop4_pte_addr,
-				hop4_pte_addr);
-
-		rc = -EINVAL;
-		goto err;
-	}
-
-	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
-			| PAGE_PRESENT_MASK;
-
-	if (is_huge)
-		write_final_pte(ctx, hop3_pte_addr, curr_pte);
-	else
-		write_final_pte(ctx, hop4_pte_addr, curr_pte);
-
-	if (hop1_new) {
-		curr_pte =
-			(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-		write_pte(ctx, hop0_pte_addr, curr_pte);
-	}
-	if (hop2_new) {
-		curr_pte =
-			(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-		write_pte(ctx, hop1_pte_addr, curr_pte);
-		get_pte(ctx, hop1_addr);
-	}
-	if (hop3_new) {
-		curr_pte =
-			(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
-		write_pte(ctx, hop2_pte_addr, curr_pte);
-		get_pte(ctx, hop2_addr);
-	}
-
-	if (!is_huge) {
-		if (hop4_new) {
-			curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
-					PAGE_PRESENT_MASK;
-			write_pte(ctx, hop3_pte_addr, curr_pte);
-			get_pte(ctx, hop3_addr);
-		}
-
-		get_pte(ctx, hop4_addr);
-	} else {
-		get_pte(ctx, hop3_addr);
-	}
-
-	return 0;
-
-err:
-	if (hop4_new)
-		free_hop(ctx, hop4_addr);
-	if (hop3_new)
-		free_hop(ctx, hop3_addr);
-	if (hop2_new)
-		free_hop(ctx, hop2_addr);
-	if (hop1_new)
-		free_hop(ctx, hop1_addr);
+		hdev->mmu_func.flush(ctx);
 
 	return rc;
 }
@@ -984,7 +231,7 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
 	real_phys_addr = phys_addr;
 
 	for (i = 0 ; i < npages ; i++) {
-		rc = _hl_mmu_map(ctx, real_virt_addr, real_phys_addr,
+		rc = hdev->mmu_func.map(ctx, real_virt_addr, real_phys_addr,
 				real_page_size, is_dram_addr);
 		if (rc)
 			goto err;
@@ -995,21 +242,21 @@ int hl_mmu_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr, u32 page_size,
 	}
 
 	if (flush_pte)
-		flush(ctx);
+		hdev->mmu_func.flush(ctx);
 
 	return 0;
 
 err:
 	real_virt_addr = virt_addr;
 	for (i = 0 ; i < mapped_cnt ; i++) {
-		if (_hl_mmu_unmap(ctx, real_virt_addr, is_dram_addr))
+		if (hdev->mmu_func.unmap(ctx, real_virt_addr, is_dram_addr))
 			dev_warn_ratelimited(hdev->dev,
 				"failed to unmap va: 0x%llx\n", real_virt_addr);
 
 		real_virt_addr += real_page_size;
 	}
 
-	flush(ctx);
+	hdev->mmu_func.flush(ctx);
 
 	return rc;
 }
@@ -1022,7 +269,10 @@ err:
  */
 void hl_mmu_swap_out(struct hl_ctx *ctx)
 {
+	struct hl_device *hdev = ctx->hdev;
 
+	if (hdev->mmu_enable)
+		hdev->mmu_func.swap_out(ctx);
 }
 
 /*
@@ -1033,5 +283,27 @@ void hl_mmu_swap_out(struct hl_ctx *ctx)
  */
 void hl_mmu_swap_in(struct hl_ctx *ctx)
 {
+	struct hl_device *hdev = ctx->hdev;
+
+	if (hdev->mmu_enable)
+		hdev->mmu_func.swap_in(ctx);
+}
+
+int hl_mmu_if_set_funcs(struct hl_device *hdev)
+{
+	if (!hdev->mmu_enable)
+		return 0;
+
+	switch (hdev->asic_type) {
+	case ASIC_GOYA:
+	case ASIC_GAUDI:
+		hl_mmu_v1_set_funcs(hdev);
+		break;
+	default:
+		dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
+			hdev->asic_type);
+		return -EOPNOTSUPP;
+	}
 
+	return 0;
 }
diff --git a/drivers/misc/habanalabs/common/mmu_v1.c b/drivers/misc/habanalabs/common/mmu_v1.c
new file mode 100644
index 000000000000..8d1eb5265419
--- /dev/null
+++ b/drivers/misc/habanalabs/common/mmu_v1.c
@@ -0,0 +1,863 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Copyright 2016-2019 HabanaLabs, Ltd.
+ * All Rights Reserved.
+ */
+
+#include "habanalabs.h"
+#include "../include/hw_ip/mmu/mmu_general.h"
+
+#include <linux/genalloc.h>
+#include <linux/slab.h>
+
+static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr);
+
+static struct pgt_info *get_pgt_info(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = NULL;
+
+	hash_for_each_possible(ctx->mmu_shadow_hash, pgt_info, node,
+				(unsigned long) hop_addr)
+		if (hop_addr == pgt_info->shadow_addr)
+			break;
+
+	return pgt_info;
+}
+
+static void _free_hop(struct hl_ctx *ctx, struct pgt_info *pgt_info)
+{
+	struct hl_device *hdev = ctx->hdev;
+
+	gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, pgt_info->phys_addr,
+			hdev->asic_prop.mmu_hop_table_size);
+	hash_del(&pgt_info->node);
+	kfree((u64 *) (uintptr_t) pgt_info->shadow_addr);
+	kfree(pgt_info);
+}
+
+static void free_hop(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
+
+	_free_hop(ctx, pgt_info);
+}
+
+static u64 alloc_hop(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct pgt_info *pgt_info;
+	u64 phys_addr, shadow_addr;
+
+	pgt_info = kmalloc(sizeof(*pgt_info), GFP_KERNEL);
+	if (!pgt_info)
+		return ULLONG_MAX;
+
+	phys_addr = (u64) gen_pool_alloc(hdev->mmu_priv.mmu_pgt_pool,
+					prop->mmu_hop_table_size);
+	if (!phys_addr) {
+		dev_err(hdev->dev, "failed to allocate page\n");
+		goto pool_add_err;
+	}
+
+	shadow_addr = (u64) (uintptr_t) kzalloc(prop->mmu_hop_table_size,
+						GFP_KERNEL);
+	if (!shadow_addr)
+		goto shadow_err;
+
+	pgt_info->phys_addr = phys_addr;
+	pgt_info->shadow_addr = shadow_addr;
+	pgt_info->ctx = ctx;
+	pgt_info->num_of_ptes = 0;
+	hash_add(ctx->mmu_shadow_hash, &pgt_info->node, shadow_addr);
+
+	return shadow_addr;
+
+shadow_err:
+	gen_pool_free(hdev->mmu_priv.mmu_pgt_pool, phys_addr,
+			prop->mmu_hop_table_size);
+pool_add_err:
+	kfree(pgt_info);
+
+	return ULLONG_MAX;
+}
+
+static inline u64 get_phys_hop0_addr(struct hl_ctx *ctx)
+{
+	return ctx->hdev->asic_prop.mmu_pgt_addr +
+			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+static inline u64 get_hop0_addr(struct hl_ctx *ctx)
+{
+	return (u64) (uintptr_t) ctx->hdev->mmu_priv.mmu_shadow_hop0 +
+			(ctx->asid * ctx->hdev->asic_prop.mmu_hop_table_size);
+}
+
+static void flush(struct hl_ctx *ctx)
+{
+	/* flush all writes from all cores to reach PCI */
+	mb();
+	ctx->hdev->asic_funcs->read_pte(ctx->hdev, get_phys_hop0_addr(ctx));
+}
+
+/* transform the value to physical address when writing to H/W */
+static inline void write_pte(struct hl_ctx *ctx, u64 shadow_pte_addr, u64 val)
+{
+	/*
+	 * The value to write is actually the address of the next shadow hop +
+	 * flags at the 12 LSBs.
+	 * Hence in order to get the value to write to the physical PTE, we
+	 * clear the 12 LSBs and translate the shadow hop to its associated
+	 * physical hop, and add back the original 12 LSBs.
+	 */
+	u64 phys_val = get_phys_addr(ctx, val & HOP_PHYS_ADDR_MASK) |
+				(val & FLAGS_MASK);
+
+	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
+					get_phys_addr(ctx, shadow_pte_addr),
+					phys_val);
+
+	*(u64 *) (uintptr_t) shadow_pte_addr = val;
+}
+
+/* do not transform the value to physical address when writing to H/W */
+static inline void write_final_pte(struct hl_ctx *ctx, u64 shadow_pte_addr,
+					u64 val)
+{
+	ctx->hdev->asic_funcs->write_pte(ctx->hdev,
+					get_phys_addr(ctx, shadow_pte_addr),
+					val);
+	*(u64 *) (uintptr_t) shadow_pte_addr = val;
+}
+
+/* clear the last and present bits */
+static inline void clear_pte(struct hl_ctx *ctx, u64 pte_addr)
+{
+	/* no need to transform the value to physical address */
+	write_final_pte(ctx, pte_addr, 0);
+}
+
+static inline void get_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+	get_pgt_info(ctx, hop_addr)->num_of_ptes++;
+}
+
+/*
+ * put_pte - decrement the num of ptes and free the hop if possible
+ *
+ * @ctx: pointer to the context structure
+ * @hop_addr: addr of the hop
+ *
+ * This function returns the number of ptes left on this hop. If the number is
+ * 0, it means the pte was freed.
+ */
+static inline int put_pte(struct hl_ctx *ctx, u64 hop_addr)
+{
+	struct pgt_info *pgt_info = get_pgt_info(ctx, hop_addr);
+	int num_of_ptes_left;
+
+	pgt_info->num_of_ptes--;
+
+	/*
+	 * Need to save the number of ptes left because free_hop might free
+	 * the pgt_info
+	 */
+	num_of_ptes_left = pgt_info->num_of_ptes;
+	if (!num_of_ptes_left)
+		_free_hop(ctx, pgt_info);
+
+	return num_of_ptes_left;
+}
+
+static inline u64 get_hopN_pte_addr(struct hl_ctx *ctx, u64 hop_addr,
+					u64 virt_addr, u64 mask, u64 shift)
+{
+	return hop_addr + ctx->hdev->asic_prop.mmu_pte_size *
+			((virt_addr & mask) >> shift);
+}
+
+static inline u64 get_hop0_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop0_mask,
+					mmu_prop->hop0_shift);
+}
+
+static inline u64 get_hop1_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop1_mask,
+					mmu_prop->hop1_shift);
+}
+
+static inline u64 get_hop2_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop2_mask,
+					mmu_prop->hop2_shift);
+}
+
+static inline u64 get_hop3_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop3_mask,
+					mmu_prop->hop3_shift);
+}
+
+static inline u64 get_hop4_pte_addr(struct hl_ctx *ctx,
+					struct hl_mmu_properties *mmu_prop,
+					u64 hop_addr, u64 vaddr)
+{
+	return get_hopN_pte_addr(ctx, hop_addr, vaddr, mmu_prop->hop4_mask,
+					mmu_prop->hop4_shift);
+}
+
+static inline u64 get_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte)
+{
+	if (curr_pte & PAGE_PRESENT_MASK)
+		return curr_pte & HOP_PHYS_ADDR_MASK;
+	else
+		return ULLONG_MAX;
+}
+
+static inline u64 get_alloc_next_hop_addr(struct hl_ctx *ctx, u64 curr_pte,
+						bool *is_new_hop)
+{
+	u64 hop_addr = get_next_hop_addr(ctx, curr_pte);
+
+	if (hop_addr == ULLONG_MAX) {
+		hop_addr = alloc_hop(ctx);
+		*is_new_hop = (hop_addr != ULLONG_MAX);
+	}
+
+	return hop_addr;
+}
+
+/* translates shadow address inside hop to a physical address */
+static inline u64 get_phys_addr(struct hl_ctx *ctx, u64 shadow_addr)
+{
+	u64 page_mask = (ctx->hdev->asic_prop.mmu_hop_table_size - 1);
+	u64 shadow_hop_addr = shadow_addr & ~page_mask;
+	u64 pte_offset = shadow_addr & page_mask;
+	u64 phys_hop_addr;
+
+	if (shadow_hop_addr != get_hop0_addr(ctx))
+		phys_hop_addr = get_pgt_info(ctx, shadow_hop_addr)->phys_addr;
+	else
+		phys_hop_addr = get_phys_hop0_addr(ctx);
+
+	return phys_hop_addr + pte_offset;
+}
+
+static int dram_default_mapping_init(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
+		hop2_pte_addr, hop3_pte_addr, pte_val;
+	int rc, i, j, hop3_allocated = 0;
+
+	if ((!hdev->dram_supports_virtual_memory) ||
+			(!hdev->dram_default_page_mapping) ||
+			(ctx->asid == HL_KERNEL_ASID_ID))
+		return 0;
+
+	num_of_hop3 = prop->dram_size_for_default_page_mapping;
+	do_div(num_of_hop3, prop->dram_page_size);
+	do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
+
+	/* add hop1 and hop2 */
+	total_hops = num_of_hop3 + 2;
+
+	ctx->dram_default_hops = kzalloc(HL_PTE_SIZE * total_hops,  GFP_KERNEL);
+	if (!ctx->dram_default_hops)
+		return -ENOMEM;
+
+	hop0_addr = get_hop0_addr(ctx);
+
+	hop1_addr = alloc_hop(ctx);
+	if (hop1_addr == ULLONG_MAX) {
+		dev_err(hdev->dev, "failed to alloc hop 1\n");
+		rc = -ENOMEM;
+		goto hop1_err;
+	}
+
+	ctx->dram_default_hops[total_hops - 1] = hop1_addr;
+
+	hop2_addr = alloc_hop(ctx);
+	if (hop2_addr == ULLONG_MAX) {
+		dev_err(hdev->dev, "failed to alloc hop 2\n");
+		rc = -ENOMEM;
+		goto hop2_err;
+	}
+
+	ctx->dram_default_hops[total_hops - 2] = hop2_addr;
+
+	for (i = 0 ; i < num_of_hop3 ; i++) {
+		ctx->dram_default_hops[i] = alloc_hop(ctx);
+		if (ctx->dram_default_hops[i] == ULLONG_MAX) {
+			dev_err(hdev->dev, "failed to alloc hop 3, i: %d\n", i);
+			rc = -ENOMEM;
+			goto hop3_err;
+		}
+		hop3_allocated++;
+	}
+
+	/* need only pte 0 in hops 0 and 1 */
+	pte_val = (hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+	write_pte(ctx, hop0_addr, pte_val);
+
+	pte_val = (hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+	write_pte(ctx, hop1_addr, pte_val);
+	get_pte(ctx, hop1_addr);
+
+	hop2_pte_addr = hop2_addr;
+	for (i = 0 ; i < num_of_hop3 ; i++) {
+		pte_val = (ctx->dram_default_hops[i] & HOP_PHYS_ADDR_MASK) |
+				PAGE_PRESENT_MASK;
+		write_pte(ctx, hop2_pte_addr, pte_val);
+		get_pte(ctx, hop2_addr);
+		hop2_pte_addr += HL_PTE_SIZE;
+	}
+
+	pte_val = (prop->mmu_dram_default_page_addr & HOP_PHYS_ADDR_MASK) |
+			LAST_MASK | PAGE_PRESENT_MASK;
+
+	for (i = 0 ; i < num_of_hop3 ; i++) {
+		hop3_pte_addr = ctx->dram_default_hops[i];
+		for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
+			write_final_pte(ctx, hop3_pte_addr, pte_val);
+			get_pte(ctx, ctx->dram_default_hops[i]);
+			hop3_pte_addr += HL_PTE_SIZE;
+		}
+	}
+
+	flush(ctx);
+
+	return 0;
+
+hop3_err:
+	for (i = 0 ; i < hop3_allocated ; i++)
+		free_hop(ctx, ctx->dram_default_hops[i]);
+
+	free_hop(ctx, hop2_addr);
+hop2_err:
+	free_hop(ctx, hop1_addr);
+hop1_err:
+	kfree(ctx->dram_default_hops);
+
+	return rc;
+}
+
+static void dram_default_mapping_fini(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	u64 num_of_hop3, total_hops, hop0_addr, hop1_addr, hop2_addr,
+		hop2_pte_addr, hop3_pte_addr;
+	int i, j;
+
+	if ((!hdev->dram_supports_virtual_memory) ||
+			(!hdev->dram_default_page_mapping) ||
+			(ctx->asid == HL_KERNEL_ASID_ID))
+		return;
+
+	num_of_hop3 = prop->dram_size_for_default_page_mapping;
+	do_div(num_of_hop3, prop->dram_page_size);
+	do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
+
+	hop0_addr = get_hop0_addr(ctx);
+	/* add hop1 and hop2 */
+	total_hops = num_of_hop3 + 2;
+	hop1_addr = ctx->dram_default_hops[total_hops - 1];
+	hop2_addr = ctx->dram_default_hops[total_hops - 2];
+
+	for (i = 0 ; i < num_of_hop3 ; i++) {
+		hop3_pte_addr = ctx->dram_default_hops[i];
+		for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
+			clear_pte(ctx, hop3_pte_addr);
+			put_pte(ctx, ctx->dram_default_hops[i]);
+			hop3_pte_addr += HL_PTE_SIZE;
+		}
+	}
+
+	hop2_pte_addr = hop2_addr;
+	hop2_pte_addr = hop2_addr;
+	for (i = 0 ; i < num_of_hop3 ; i++) {
+		clear_pte(ctx, hop2_pte_addr);
+		put_pte(ctx, hop2_addr);
+		hop2_pte_addr += HL_PTE_SIZE;
+	}
+
+	clear_pte(ctx, hop1_addr);
+	put_pte(ctx, hop1_addr);
+	clear_pte(ctx, hop0_addr);
+
+	kfree(ctx->dram_default_hops);
+
+	flush(ctx);
+}
+
+/**
+ * hl_mmu_v1_init() - initialize the MMU module.
+ * @hdev: habanalabs device structure.
+ *
+ * This function does the following:
+ * - Create a pool of pages for pgt_infos.
+ * - Create a shadow table for pgt
+ *
+ * Return: 0 for success, non-zero for failure.
+ */
+static int hl_mmu_v1_init(struct hl_device *hdev)
+{
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	int rc;
+
+	hdev->mmu_priv.mmu_pgt_pool =
+			gen_pool_create(__ffs(prop->mmu_hop_table_size), -1);
+
+	if (!hdev->mmu_priv.mmu_pgt_pool) {
+		dev_err(hdev->dev, "Failed to create page gen pool\n");
+		return -ENOMEM;
+	}
+
+	rc = gen_pool_add(hdev->mmu_priv.mmu_pgt_pool, prop->mmu_pgt_addr +
+			prop->mmu_hop0_tables_total_size,
+			prop->mmu_pgt_size - prop->mmu_hop0_tables_total_size,
+			-1);
+	if (rc) {
+		dev_err(hdev->dev, "Failed to add memory to page gen pool\n");
+		goto err_pool_add;
+	}
+
+	hdev->mmu_priv.mmu_shadow_hop0 = kvmalloc_array(prop->max_asid,
+						prop->mmu_hop_table_size,
+						GFP_KERNEL | __GFP_ZERO);
+	if (ZERO_OR_NULL_PTR(hdev->mmu_priv.mmu_shadow_hop0)) {
+		rc = -ENOMEM;
+		goto err_pool_add;
+	}
+
+	/* MMU H/W init will be done in device hw_init() */
+
+	return 0;
+
+err_pool_add:
+	gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
+
+	return rc;
+}
+
+/**
+ * hl_mmu_fini() - release the MMU module.
+ * @hdev: habanalabs device structure.
+ *
+ * This function does the following:
+ * - Disable MMU in H/W.
+ * - Free the pgt_infos pool.
+ *
+ * All contexts should be freed before calling this function.
+ */
+static void hl_mmu_v1_fini(struct hl_device *hdev)
+{
+	/* MMU H/W fini was already done in device hw_fini() */
+
+	kvfree(hdev->mmu_priv.mmu_shadow_hop0);
+	gen_pool_destroy(hdev->mmu_priv.mmu_pgt_pool);
+}
+
+/**
+ * hl_mmu_ctx_init() - initialize a context for using the MMU module.
+ * @ctx: pointer to the context structure to initialize.
+ *
+ * Initialize a mutex to protect the concurrent mapping flow, a hash to hold all
+ * page tables hops related to this context.
+ * Return: 0 on success, non-zero otherwise.
+ */
+static int hl_mmu_v1_ctx_init(struct hl_ctx *ctx)
+{
+	mutex_init(&ctx->mmu_lock);
+	hash_init(ctx->mmu_shadow_hash);
+
+	return dram_default_mapping_init(ctx);
+}
+
+/*
+ * hl_mmu_ctx_fini - disable a ctx from using the mmu module
+ *
+ * @ctx: pointer to the context structure
+ *
+ * This function does the following:
+ * - Free any pgts which were not freed yet
+ * - Free the mutex
+ * - Free DRAM default page mapping hops
+ */
+static void hl_mmu_v1_ctx_fini(struct hl_ctx *ctx)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct pgt_info *pgt_info;
+	struct hlist_node *tmp;
+	int i;
+
+	dram_default_mapping_fini(ctx);
+
+	if (!hash_empty(ctx->mmu_shadow_hash))
+		dev_err(hdev->dev, "ctx %d is freed while it has pgts in use\n",
+			ctx->asid);
+
+	hash_for_each_safe(ctx->mmu_shadow_hash, i, tmp, pgt_info, node) {
+		dev_err_ratelimited(hdev->dev,
+			"pgt_info of addr 0x%llx of asid %d was not destroyed, num_ptes: %d\n",
+			pgt_info->phys_addr, ctx->asid, pgt_info->num_of_ptes);
+		_free_hop(ctx, pgt_info);
+	}
+
+	mutex_destroy(&ctx->mmu_lock);
+}
+
+static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
+				u64 virt_addr, bool is_dram_addr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
+	u64 hop0_addr = 0, hop0_pte_addr = 0,
+		hop1_addr = 0, hop1_pte_addr = 0,
+		hop2_addr = 0, hop2_pte_addr = 0,
+		hop3_addr = 0, hop3_pte_addr = 0,
+		hop4_addr = 0, hop4_pte_addr = 0,
+		curr_pte;
+	bool is_huge, clear_hop3 = true;
+
+	/* shifts and masks are the same in PMMU and HPMMU, use one of them */
+	mmu_prop = is_dram_addr ? &prop->dmmu : &prop->pmmu;
+
+	hop0_addr = get_hop0_addr(ctx);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
+
+	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
+
+	hop1_addr = get_next_hop_addr(ctx, curr_pte);
+
+	if (hop1_addr == ULLONG_MAX)
+		goto not_mapped;
+
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
+
+	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
+
+	hop2_addr = get_next_hop_addr(ctx, curr_pte);
+
+	if (hop2_addr == ULLONG_MAX)
+		goto not_mapped;
+
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
+
+	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
+
+	hop3_addr = get_next_hop_addr(ctx, curr_pte);
+
+	if (hop3_addr == ULLONG_MAX)
+		goto not_mapped;
+
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
+
+	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
+
+	is_huge = curr_pte & LAST_MASK;
+
+	if (is_dram_addr && !is_huge) {
+		dev_err(hdev->dev,
+				"DRAM unmapping should use huge pages only\n");
+		return -EFAULT;
+	}
+
+	if (!is_huge) {
+		hop4_addr = get_next_hop_addr(ctx, curr_pte);
+
+		if (hop4_addr == ULLONG_MAX)
+			goto not_mapped;
+
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
+
+		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
+
+		clear_hop3 = false;
+	}
+
+	if (hdev->dram_default_page_mapping && is_dram_addr) {
+		u64 default_pte = (prop->mmu_dram_default_page_addr &
+				HOP_PHYS_ADDR_MASK) | LAST_MASK |
+					PAGE_PRESENT_MASK;
+		if (curr_pte == default_pte) {
+			dev_err(hdev->dev,
+				"DRAM: hop3 PTE points to zero page, can't unmap, va: 0x%llx\n",
+					virt_addr);
+			goto not_mapped;
+		}
+
+		if (!(curr_pte & PAGE_PRESENT_MASK)) {
+			dev_err(hdev->dev,
+				"DRAM: hop3 PTE is cleared! can't unmap, va: 0x%llx\n",
+					virt_addr);
+			goto not_mapped;
+		}
+
+		write_final_pte(ctx, hop3_pte_addr, default_pte);
+		put_pte(ctx, hop3_addr);
+	} else {
+		if (!(curr_pte & PAGE_PRESENT_MASK))
+			goto not_mapped;
+
+		if (hop4_addr)
+			clear_pte(ctx, hop4_pte_addr);
+		else
+			clear_pte(ctx, hop3_pte_addr);
+
+		if (hop4_addr && !put_pte(ctx, hop4_addr))
+			clear_hop3 = true;
+
+		if (!clear_hop3)
+			goto mapped;
+
+		clear_pte(ctx, hop3_pte_addr);
+
+		if (put_pte(ctx, hop3_addr))
+			goto mapped;
+
+		clear_pte(ctx, hop2_pte_addr);
+
+		if (put_pte(ctx, hop2_addr))
+			goto mapped;
+
+		clear_pte(ctx, hop1_pte_addr);
+
+		if (put_pte(ctx, hop1_addr))
+			goto mapped;
+
+		clear_pte(ctx, hop0_pte_addr);
+	}
+
+mapped:
+	return 0;
+
+not_mapped:
+	dev_err(hdev->dev, "virt addr 0x%llx is not mapped to phys addr\n",
+		virt_addr);
+
+	return -EINVAL;
+}
+
+static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
+			u32 page_size, bool is_dram_addr)
+{
+	struct hl_device *hdev = ctx->hdev;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct hl_mmu_properties *mmu_prop;
+	u64 hop0_addr = 0, hop0_pte_addr = 0,
+		hop1_addr = 0, hop1_pte_addr = 0,
+		hop2_addr = 0, hop2_pte_addr = 0,
+		hop3_addr = 0, hop3_pte_addr = 0,
+		hop4_addr = 0, hop4_pte_addr = 0,
+		curr_pte = 0;
+	bool hop1_new = false, hop2_new = false, hop3_new = false,
+		hop4_new = false, is_huge;
+	int rc = -ENOMEM;
+
+	/*
+	 * This mapping function can map a page or a huge page. For huge page
+	 * there are only 3 hops rather than 4. Currently the DRAM allocation
+	 * uses huge pages only but user memory could have been allocated with
+	 * one of the two page sizes. Since this is a common code for all the
+	 * three cases, we need this hugs page check.
+	 */
+	if (is_dram_addr) {
+		mmu_prop = &prop->dmmu;
+		is_huge = true;
+	} else if (page_size == prop->pmmu_huge.page_size) {
+		mmu_prop = &prop->pmmu_huge;
+		is_huge = true;
+	} else {
+		mmu_prop = &prop->pmmu;
+		is_huge = false;
+	}
+
+	hop0_addr = get_hop0_addr(ctx);
+	hop0_pte_addr = get_hop0_pte_addr(ctx, mmu_prop, hop0_addr, virt_addr);
+	curr_pte = *(u64 *) (uintptr_t) hop0_pte_addr;
+
+	hop1_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop1_new);
+	if (hop1_addr == ULLONG_MAX)
+		goto err;
+
+	hop1_pte_addr = get_hop1_pte_addr(ctx, mmu_prop, hop1_addr, virt_addr);
+	curr_pte = *(u64 *) (uintptr_t) hop1_pte_addr;
+
+	hop2_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop2_new);
+	if (hop2_addr == ULLONG_MAX)
+		goto err;
+
+	hop2_pte_addr = get_hop2_pte_addr(ctx, mmu_prop, hop2_addr, virt_addr);
+	curr_pte = *(u64 *) (uintptr_t) hop2_pte_addr;
+
+	hop3_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop3_new);
+	if (hop3_addr == ULLONG_MAX)
+		goto err;
+
+	hop3_pte_addr = get_hop3_pte_addr(ctx, mmu_prop, hop3_addr, virt_addr);
+	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
+
+	if (!is_huge) {
+		hop4_addr = get_alloc_next_hop_addr(ctx, curr_pte, &hop4_new);
+		if (hop4_addr == ULLONG_MAX)
+			goto err;
+
+		hop4_pte_addr = get_hop4_pte_addr(ctx, mmu_prop, hop4_addr,
+							virt_addr);
+		curr_pte = *(u64 *) (uintptr_t) hop4_pte_addr;
+	}
+
+	if (hdev->dram_default_page_mapping && is_dram_addr) {
+		u64 default_pte = (prop->mmu_dram_default_page_addr &
+					HOP_PHYS_ADDR_MASK) | LAST_MASK |
+						PAGE_PRESENT_MASK;
+
+		if (curr_pte != default_pte) {
+			dev_err(hdev->dev,
+				"DRAM: mapping already exists for virt_addr 0x%llx\n",
+					virt_addr);
+			rc = -EINVAL;
+			goto err;
+		}
+
+		if (hop1_new || hop2_new || hop3_new || hop4_new) {
+			dev_err(hdev->dev,
+				"DRAM mapping should not allocate more hops\n");
+			rc = -EFAULT;
+			goto err;
+		}
+	} else if (curr_pte & PAGE_PRESENT_MASK) {
+		dev_err(hdev->dev,
+			"mapping already exists for virt_addr 0x%llx\n",
+				virt_addr);
+
+		dev_dbg(hdev->dev, "hop0 pte: 0x%llx (0x%llx)\n",
+			*(u64 *) (uintptr_t) hop0_pte_addr, hop0_pte_addr);
+		dev_dbg(hdev->dev, "hop1 pte: 0x%llx (0x%llx)\n",
+			*(u64 *) (uintptr_t) hop1_pte_addr, hop1_pte_addr);
+		dev_dbg(hdev->dev, "hop2 pte: 0x%llx (0x%llx)\n",
+			*(u64 *) (uintptr_t) hop2_pte_addr, hop2_pte_addr);
+		dev_dbg(hdev->dev, "hop3 pte: 0x%llx (0x%llx)\n",
+			*(u64 *) (uintptr_t) hop3_pte_addr, hop3_pte_addr);
+
+		if (!is_huge)
+			dev_dbg(hdev->dev, "hop4 pte: 0x%llx (0x%llx)\n",
+				*(u64 *) (uintptr_t) hop4_pte_addr,
+				hop4_pte_addr);
+
+		rc = -EINVAL;
+		goto err;
+	}
+
+	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
+			| PAGE_PRESENT_MASK;
+
+	if (is_huge)
+		write_final_pte(ctx, hop3_pte_addr, curr_pte);
+	else
+		write_final_pte(ctx, hop4_pte_addr, curr_pte);
+
+	if (hop1_new) {
+		curr_pte =
+			(hop1_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+		write_pte(ctx, hop0_pte_addr, curr_pte);
+	}
+	if (hop2_new) {
+		curr_pte =
+			(hop2_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+		write_pte(ctx, hop1_pte_addr, curr_pte);
+		get_pte(ctx, hop1_addr);
+	}
+	if (hop3_new) {
+		curr_pte =
+			(hop3_addr & HOP_PHYS_ADDR_MASK) | PAGE_PRESENT_MASK;
+		write_pte(ctx, hop2_pte_addr, curr_pte);
+		get_pte(ctx, hop2_addr);
+	}
+
+	if (!is_huge) {
+		if (hop4_new) {
+			curr_pte = (hop4_addr & HOP_PHYS_ADDR_MASK) |
+					PAGE_PRESENT_MASK;
+			write_pte(ctx, hop3_pte_addr, curr_pte);
+			get_pte(ctx, hop3_addr);
+		}
+
+		get_pte(ctx, hop4_addr);
+	} else {
+		get_pte(ctx, hop3_addr);
+	}
+
+	return 0;
+
+err:
+	if (hop4_new)
+		free_hop(ctx, hop4_addr);
+	if (hop3_new)
+		free_hop(ctx, hop3_addr);
+	if (hop2_new)
+		free_hop(ctx, hop2_addr);
+	if (hop1_new)
+		free_hop(ctx, hop1_addr);
+
+	return rc;
+}
+
+/*
+ * hl_mmu_v1_swap_out - marks all mapping of the given ctx as swapped out
+ *
+ * @ctx: pointer to the context structure
+ *
+ */
+static void hl_mmu_v1_swap_out(struct hl_ctx *ctx)
+{
+
+}
+
+/*
+ * hl_mmu_v1_swap_in - marks all mapping of the given ctx as swapped in
+ *
+ * @ctx: pointer to the context structure
+ *
+ */
+static void hl_mmu_v1_swap_in(struct hl_ctx *ctx)
+{
+
+}
+
+/*
+ * hl_mmu_v1_prepare - prepare mmu  for working with mmu v1
+ *
+ * @hdev: pointer to the device structure
+ */
+void hl_mmu_v1_set_funcs(struct hl_device *hdev)
+{
+	struct hl_mmu_funcs *mmu = &hdev->mmu_func;
+
+	mmu->init = hl_mmu_v1_init;
+	mmu->fini = hl_mmu_v1_fini;
+	mmu->ctx_init = hl_mmu_v1_ctx_init;
+	mmu->ctx_fini = hl_mmu_v1_ctx_fini;
+	mmu->map = _hl_mmu_v1_map;
+	mmu->unmap = _hl_mmu_v1_unmap;
+	mmu->flush = flush;
+	mmu->swap_out = hl_mmu_v1_swap_out;
+	mmu->swap_in = hl_mmu_v1_swap_in;
+}
diff --git a/drivers/misc/habanalabs/common/pci.c b/drivers/misc/habanalabs/common/pci.c
index 7bd3737571f3..4327e5704ebb 100644
--- a/drivers/misc/habanalabs/common/pci.c
+++ b/drivers/misc/habanalabs/common/pci.c
@@ -9,7 +9,6 @@
 #include "../include/hw_ip/pci/pci_general.h"
 
 #include <linux/pci.h>
-#include <linux/bitfield.h>
 
 #define HL_PLDM_PCI_ELBI_TIMEOUT_MSEC	(HL_PCI_ELBI_TIMEOUT_MSEC * 10)
 
@@ -227,7 +226,7 @@ int hl_pci_set_inbound_region(struct hl_device *hdev, u8 region,
 	}
 
 	/* Point to the specified address */
-	rc = hl_pci_iatu_write(hdev, offset + 0x14,
+	rc |= hl_pci_iatu_write(hdev, offset + 0x14,
 			lower_32_bits(pci_region->addr));
 	rc |= hl_pci_iatu_write(hdev, offset + 0x18,
 			upper_32_bits(pci_region->addr));
@@ -339,12 +338,17 @@ static int hl_pci_set_dma_mask(struct hl_device *hdev)
 /**
  * hl_pci_init() - PCI initialization code.
  * @hdev: Pointer to hl_device structure.
+ * @cpu_boot_status_reg: status register of the device's CPU
+ * @boot_err0_reg: boot error register of the device's CPU
+ * @preboot_ver_timeout: how much to wait before bailing out on reading
+ *                       the preboot version
  *
  * Set DMA masks, initialize the PCI controller and map the PCI BARs.
  *
  * Return: 0 on success, non-zero for failure.
  */
-int hl_pci_init(struct hl_device *hdev)
+int hl_pci_init(struct hl_device *hdev, u32 cpu_boot_status_reg,
+		u32 boot_err0_reg, u32 preboot_ver_timeout)
 {
 	struct pci_dev *pdev = hdev->pdev;
 	int rc;
@@ -369,15 +373,26 @@ int hl_pci_init(struct hl_device *hdev)
 	rc = hdev->asic_funcs->init_iatu(hdev);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to initialize iATU\n");
-		goto disable_device;
+		goto unmap_pci_bars;
 	}
 
 	rc = hl_pci_set_dma_mask(hdev);
 	if (rc)
-		goto disable_device;
+		goto unmap_pci_bars;
+
+	/* Before continuing in the initialization, we need to read the preboot
+	 * version to determine whether we run with a security-enabled firmware
+	 * The check will be done in each ASIC's specific code
+	 */
+	rc = hl_fw_read_preboot_ver(hdev, cpu_boot_status_reg, boot_err0_reg,
+					preboot_ver_timeout);
+	if (rc)
+		goto unmap_pci_bars;
 
 	return 0;
 
+unmap_pci_bars:
+	hl_pci_bars_unmap(hdev);
 disable_device:
 	pci_clear_master(pdev);
 	pci_disable_device(pdev);
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index b3cb0ac4721c..3ceae87016b1 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -11,18 +11,18 @@
 
 long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
 	if (curr)
-		pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_CURR_GET <<
-						ARMCP_PKT_CTL_OPCODE_SHIFT);
+		pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_CURR_GET <<
+						CPUCP_PKT_CTL_OPCODE_SHIFT);
 	else
-		pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_GET <<
-						ARMCP_PKT_CTL_OPCODE_SHIFT);
+		pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_GET <<
+						CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.pll_index = cpu_to_le32(pll_index);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@@ -40,13 +40,13 @@ long hl_get_frequency(struct hl_device *hdev, u32 pll_index, bool curr)
 
 void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_FREQUENCY_SET <<
-					ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_FREQUENCY_SET <<
+					CPUCP_PKT_CTL_OPCODE_SHIFT);
 	pkt.pll_index = cpu_to_le32(pll_index);
 	pkt.value = cpu_to_le64(freq);
 
@@ -61,14 +61,14 @@ void hl_set_frequency(struct hl_device *hdev, u32 pll_index, u64 freq)
 
 u64 hl_get_max_power(struct hl_device *hdev)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	long result;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_GET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_GET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 						0, &result);
@@ -81,16 +81,16 @@ u64 hl_get_max_power(struct hl_device *hdev)
 	return result;
 }
 
-void hl_set_max_power(struct hl_device *hdev, u64 value)
+void hl_set_max_power(struct hl_device *hdev)
 {
-	struct armcp_packet pkt;
+	struct cpucp_packet pkt;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(ARMCP_PACKET_MAX_POWER_SET <<
-				ARMCP_PKT_CTL_OPCODE_SHIFT);
-	pkt.value = cpu_to_le64(value);
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_MAX_POWER_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.value = cpu_to_le64(hdev->max_power);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 						0, NULL);
@@ -112,7 +112,7 @@ static ssize_t armcp_kernel_ver_show(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%s", hdev->asic_prop.armcp_info.kernel_version);
+	return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
 }
 
 static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
@@ -120,7 +120,7 @@ static ssize_t armcp_ver_show(struct device *dev, struct device_attribute *attr,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.armcp_version);
+	return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
 }
 
 static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
@@ -129,7 +129,23 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
 	return sprintf(buf, "0x%08x\n",
-			hdev->asic_prop.armcp_info.cpld_version);
+			hdev->asic_prop.cpucp_info.cpld_version);
+}
+
+static ssize_t cpucp_kernel_ver_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.kernel_version);
+}
+
+static ssize_t cpucp_ver_show(struct device *dev, struct device_attribute *attr,
+				char *buf)
+{
+	struct hl_device *hdev = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.cpucp_version);
 }
 
 static ssize_t infineon_ver_show(struct device *dev,
@@ -138,7 +154,7 @@ static ssize_t infineon_ver_show(struct device *dev,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
 	return sprintf(buf, "0x%04x\n",
-			hdev->asic_prop.armcp_info.infineon_version);
+			hdev->asic_prop.cpucp_info.infineon_version);
 }
 
 static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
@@ -146,7 +162,7 @@ static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%s\n", hdev->asic_prop.armcp_info.fuse_version);
+	return sprintf(buf, "%s\n", hdev->asic_prop.cpucp_info.fuse_version);
 }
 
 static ssize_t thermal_ver_show(struct device *dev,
@@ -154,7 +170,7 @@ static ssize_t thermal_ver_show(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%s", hdev->asic_prop.armcp_info.thermal_version);
+	return sprintf(buf, "%s", hdev->asic_prop.cpucp_info.thermal_version);
 }
 
 static ssize_t preboot_btl_ver_show(struct device *dev,
@@ -316,7 +332,7 @@ static ssize_t max_power_store(struct device *dev,
 	}
 
 	hdev->max_power = value;
-	hl_set_max_power(hdev, value);
+	hl_set_max_power(hdev);
 
 out:
 	return count;
@@ -356,6 +372,8 @@ out:
 static DEVICE_ATTR_RO(armcp_kernel_ver);
 static DEVICE_ATTR_RO(armcp_ver);
 static DEVICE_ATTR_RO(cpld_ver);
+static DEVICE_ATTR_RO(cpucp_kernel_ver);
+static DEVICE_ATTR_RO(cpucp_ver);
 static DEVICE_ATTR_RO(device_type);
 static DEVICE_ATTR_RO(fuse_ver);
 static DEVICE_ATTR_WO(hard_reset);
@@ -380,6 +398,8 @@ static struct attribute *hl_dev_attrs[] = {
 	&dev_attr_armcp_kernel_ver.attr,
 	&dev_attr_armcp_ver.attr,
 	&dev_attr_cpld_ver.attr,
+	&dev_attr_cpucp_kernel_ver.attr,
+	&dev_attr_cpucp_ver.attr,
 	&dev_attr_device_type.attr,
 	&dev_attr_fuse_ver.attr,
 	&dev_attr_hard_reset.attr,
@@ -422,6 +442,7 @@ int hl_sysfs_init(struct hl_device *hdev)
 		hdev->pm_mng_profile = PM_AUTO;
 	else
 		hdev->pm_mng_profile = PM_MANUAL;
+
 	hdev->max_power = hdev->asic_prop.max_power_default;
 
 	hdev->asic_funcs->add_device_attr(hdev, &hl_dev_clks_attr_group);
author	Peter Zijlstra <peterz@infradead.org>	2020-11-26 13:16:55 +0100
committer	Peter Zijlstra <peterz@infradead.org>	2020-11-26 13:16:55 +0100
commit	20c7775aecea04d8ca322039969d49dcf568e0e9 (patch)
tree	138c057839197c9021043353e994815c0250e669 /drivers/misc/habanalabs/common
parent	perf/x86/intel: Add event constraint for CYCLE_ACTIVITY.STALLS_MEM_ANY (diff)
parent	Merge tag 'media/v5.10-2' of git://git.kernel.org/pub/scm/linux/kernel/git/mchehab/linux-media (diff)
download	linux-dev-20c7775aecea04d8ca322039969d49dcf568e0e9.tar.xz linux-dev-20c7775aecea04d8ca322039969d49dcf568e0e9.zip