From ed65bfd9fd86dec3772570b0320ca85b9fb69f2e Mon Sep 17 00:00:00 2001
From: Daniel Vetter <daniel.vetter@ffwll.ch>
Date: Mon, 11 May 2020 11:11:42 +0200
Subject: [PATCH 1/6] habanalabs: don't set default fence_ops->wait

It's the default.

Also so much for "we're not going to tell the graphics people how to
review their code", dma_fence is a pretty core piece of gpu driver
infrastructure. And it's very much uapi relevant, including piles of
corresponding userspace protocols and libraries for how to pass these
around.

Would be great if habanalabs would not use this (from a quick look
it's not needed at all), since open source the userspace and playing
by the usual rules isn't on the table. If that's not possible (because
it's actually using the uapi part of dma_fence to interact with gpu
drivers) then we have exactly what everyone promised we'd want to
avoid.

Signed-off-by: Daniel Vetter <daniel.vetter@intel.com>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/command_submission.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 7da9847eb9d6c..75d8302352e5d 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -99,7 +99,6 @@ static const struct dma_fence_ops hl_fence_ops = {
 	.get_driver_name = hl_fence_get_driver_name,
 	.get_timeline_name = hl_fence_get_timeline_name,
 	.enable_signaling = hl_fence_enable_signaling,
-	.wait = dma_fence_default_wait,
 	.release = hl_fence_release
 };
 

From 42d0b0b95f9822fc02440c0ae5f9192c0ebe54bf Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Sun, 17 May 2020 17:35:39 +0300
Subject: [PATCH 2/6] habanalabs: improve MMU cache invalidation code

A new sequence is introduced to invalidate the MMU cache in order to avoid
timeouts.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 4cb1f71dd4f1e..093384731f0d0 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5982,16 +5982,18 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 		timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
 
 	/* L0 & L1 invalidation */
-	WREG32(mmSTLB_INV_ALL_START, 1);
+	WREG32(mmSTLB_INV_PS, 2);
 
 	rc = hl_poll_timeout(
 		hdev,
-		mmSTLB_INV_ALL_START,
+		mmSTLB_INV_PS,
 		status,
 		!status,
 		1000,
 		timeout_usec);
 
+	WREG32(mmSTLB_INV_SET, 0);
+
 	if (rc)
 		dev_notice_ratelimited(hdev->dev,
 			"Timeout when waiting for MMU cache invalidation\n");

From d798507988da3d40032c57cb87244450842e4be1 Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Sun, 17 May 2020 23:01:22 +0300
Subject: [PATCH 3/6] habanalabs: add print for soft reset due to event

Print the event name that caused the soft reset.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 093384731f0d0..3d4a569914d3c 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5843,8 +5843,12 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		soft_reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_dec_event_to_tpc_id(event_type),
 					"AXI_SLV_DEC_Error");
-		if (soft_reset_required)
+		if (soft_reset_required) {
+			dev_err_ratelimited(hdev->dev,
+					"soft reset required due to %s\n",
+					gaudi_irq_map_table[event_type].name);
 			hl_device_reset(hdev, false, false);
+		}
 		hl_fw_unmask_irq(hdev, event_type);
 		break;
 
@@ -5860,8 +5864,12 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		soft_reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_krn_event_to_tpc_id(event_type),
 					"KRN_ERR");
-		if (soft_reset_required)
+		if (soft_reset_required) {
+			dev_err_ratelimited(hdev->dev,
+					"soft reset required due to %s\n",
+					gaudi_irq_map_table[event_type].name);
 			hl_device_reset(hdev, false, false);
+		}
 		hl_fw_unmask_irq(hdev, event_type);
 		break;
 

From 66446820df1864bc371ece679c11152f5f22362c Mon Sep 17 00:00:00 2001
From: Oded Gabbay <oded.gabbay@gmail.com>
Date: Mon, 18 May 2020 16:48:01 +0300
Subject: [PATCH 4/6] habanalabs: GAUDI does not support soft-reset

GAUDI does not support soft-reset as it leaves the NIC ports in an awkward
state, where their QMANs were reset but the NIC itself is still working.

In addition, there is not much sense in doing soft-reset when training is
done on multiple GAUDIs.

Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
Reviewed-by: Tomer Tayar <ttayar@habana.ai>
---
 drivers/misc/habanalabs/device.c      |  6 +++++
 drivers/misc/habanalabs/gaudi/gaudi.c | 38 +++++++++++++++------------
 drivers/misc/habanalabs/goya/goya.c   |  1 +
 drivers/misc/habanalabs/habanalabs.h  |  2 ++
 drivers/misc/habanalabs/sysfs.c       |  5 ++++
 5 files changed, 35 insertions(+), 17 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 4b6c8de46dd86..4a4a446f479e8 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -801,6 +801,7 @@ static void device_hard_reset_pending(struct work_struct *work)
  * @hdev: pointer to habanalabs device structure
  * @hard_reset: should we do hard reset to all engines or just reset the
  *              compute/dma engines
+ * @from_hard_reset_thread: is the caller the hard-reset thread
  *
  * Block future CS and wait for pending CS to be enqueued
  * Call ASIC H/W fini
@@ -823,6 +824,11 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		return 0;
 	}
 
+	if ((!hard_reset) && (!hdev->supports_soft_reset)) {
+		dev_dbg(hdev->dev, "Doing hard-reset instead of soft-reset\n");
+		hard_reset = true;
+	}
+
 	/*
 	 * Prevent concurrency in this function - only one reset should be
 	 * done at any given time. Only need to perform this if we didn't
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 3d4a569914d3c..92a5130f06fbb 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5774,7 +5774,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	u16 event_type = ((ctl & EQ_CTL_EVENT_TYPE_MASK)
 			>> EQ_CTL_EVENT_TYPE_SHIFT);
 	u8 cause;
-	bool soft_reset_required;
+	bool reset_required;
 
 	gaudi->events_stat[event_type]++;
 	gaudi->events_stat_aggregate[event_type]++;
@@ -5840,16 +5840,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_TPC6_DEC:
 	case GAUDI_EVENT_TPC7_DEC:
 		gaudi_print_irq_info(hdev, event_type, true);
-		soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_dec_event_to_tpc_id(event_type),
 					"AXI_SLV_DEC_Error");
-		if (soft_reset_required) {
-			dev_err_ratelimited(hdev->dev,
-					"soft reset required due to %s\n",
-					gaudi_irq_map_table[event_type].name);
-			hl_device_reset(hdev, false, false);
+		if (reset_required) {
+			dev_err(hdev->dev, "hard reset required due to %s\n",
+				gaudi_irq_map_table[event_type].name);
+
+			if (hdev->hard_reset_on_fw_events)
+				hl_device_reset(hdev, true, false);
+		} else {
+			hl_fw_unmask_irq(hdev, event_type);
 		}
-		hl_fw_unmask_irq(hdev, event_type);
 		break;
 
 	case GAUDI_EVENT_TPC0_KRN_ERR:
@@ -5861,16 +5863,18 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_TPC6_KRN_ERR:
 	case GAUDI_EVENT_TPC7_KRN_ERR:
 		gaudi_print_irq_info(hdev, event_type, true);
-		soft_reset_required = gaudi_tpc_read_interrupts(hdev,
+		reset_required = gaudi_tpc_read_interrupts(hdev,
 					tpc_krn_event_to_tpc_id(event_type),
 					"KRN_ERR");
-		if (soft_reset_required) {
-			dev_err_ratelimited(hdev->dev,
-					"soft reset required due to %s\n",
-					gaudi_irq_map_table[event_type].name);
-			hl_device_reset(hdev, false, false);
+		if (reset_required) {
+			dev_err(hdev->dev, "hard reset required due to %s\n",
+				gaudi_irq_map_table[event_type].name);
+
+			if (hdev->hard_reset_on_fw_events)
+				hl_device_reset(hdev, true, false);
+		} else {
+			hl_fw_unmask_irq(hdev, event_type);
 		}
-		hl_fw_unmask_irq(hdev, event_type);
 		break;
 
 	case GAUDI_EVENT_PCIE_CORE_SERR:
@@ -5921,8 +5925,8 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 
 	case GAUDI_EVENT_RAZWI_OR_ADC_SW:
 		gaudi_print_irq_info(hdev, event_type, true);
-		hl_device_reset(hdev, false, false);
-		hl_fw_unmask_irq(hdev, event_type);
+		if (hdev->hard_reset_on_fw_events)
+			hl_device_reset(hdev, true, false);
 		break;
 
 	case GAUDI_EVENT_TPC0_BMON_SPMU:
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 15b6c3228e37c..152418dfe20c3 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -752,6 +752,7 @@ static int goya_sw_init(struct hl_device *hdev)
 
 	spin_lock_init(&goya->hw_queues_lock);
 	hdev->supports_coresight = true;
+	hdev->supports_soft_reset = true;
 
 	return 0;
 
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 5a855b7edf430..0f0691875298d 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -1436,6 +1436,7 @@ struct hl_device_idle_busy_ts {
  * @stop_on_err: true if engines should stop on error.
  * @supports_sync_stream: is sync stream supported.
  * @supports_coresight: is CoreSight supported.
+ * @supports_soft_reset: is soft reset supported.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -1522,6 +1523,7 @@ struct hl_device {
 	u8				stop_on_err;
 	u8				supports_sync_stream;
 	u8				supports_coresight;
+	u8				supports_soft_reset;
 
 	/* Parameters for bring-up */
 	u8				mmu_enable;
diff --git a/drivers/misc/habanalabs/sysfs.c b/drivers/misc/habanalabs/sysfs.c
index e4454414d0e10..5d78d5e1c7826 100644
--- a/drivers/misc/habanalabs/sysfs.c
+++ b/drivers/misc/habanalabs/sysfs.c
@@ -183,6 +183,11 @@ static ssize_t soft_reset_store(struct device *dev,
 		goto out;
 	}
 
+	if (!hdev->supports_soft_reset) {
+		dev_err(hdev->dev, "Device does not support soft-reset\n");
+		goto out;
+	}
+
 	dev_warn(hdev->dev, "Soft-Reset requested through sysfs\n");
 
 	hl_device_reset(hdev, false, false);

From 36fafe87edd636292a4ed6a3af9608f2c7d0d0fb Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Mon, 18 May 2020 22:27:46 +0300
Subject: [PATCH 5/6] habanalabs: don't allow hard reset with open processes

When the MMU is heavily used by the engines, unmapping might take a lot of
time due to a full MMU cache invalidation done as part of the unmap flow.
Hence we might not be able to kill all open processes before going to hard
reset the device, as it involves unmapping of all user memory.
In case of a failure in killing all open processes, we should stop the
hard reset flow as it might lead to a kernel crash - one thread (killing
of a process) is updating MMU structures that other thread (hard reset) is
freeing.
Stopping a hard reset flow leaves the device as nonoperational and the
user can then initiate a hard reset via sysfs to reinitialize the device.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/device.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/misc/habanalabs/device.c b/drivers/misc/habanalabs/device.c
index 4a4a446f479e8..2b38a119704c4 100644
--- a/drivers/misc/habanalabs/device.c
+++ b/drivers/misc/habanalabs/device.c
@@ -726,7 +726,7 @@ int hl_device_resume(struct hl_device *hdev)
 	return rc;
 }
 
-static void device_kill_open_processes(struct hl_device *hdev)
+static int device_kill_open_processes(struct hl_device *hdev)
 {
 	u16 pending_total, pending_cnt;
 	struct hl_fpriv	*hpriv;
@@ -779,9 +779,7 @@ static void device_kill_open_processes(struct hl_device *hdev)
 		ssleep(1);
 	}
 
-	if (!list_empty(&hdev->fpriv_list))
-		dev_crit(hdev->dev,
-			"Going to hard reset with open user contexts\n");
+	return list_empty(&hdev->fpriv_list) ? 0 : -EBUSY;
 }
 
 static void device_hard_reset_pending(struct work_struct *work)
@@ -908,7 +906,12 @@ int hl_device_reset(struct hl_device *hdev, bool hard_reset,
 		 * process can't really exit until all its CSs are done, which
 		 * is what we do in cs rollback
 		 */
-		device_kill_open_processes(hdev);
+		rc = device_kill_open_processes(hdev);
+		if (rc) {
+			dev_crit(hdev->dev,
+				"Failed to kill all open processes, stopping hard reset\n");
+			goto out_err;
+		}
 
 		/* Flush the Event queue workers to make sure no other thread is
 		 * reading or writing to registers during the reset
@@ -1391,7 +1394,9 @@ void hl_device_fini(struct hl_device *hdev)
 	 * can't really exit until all its CSs are done, which is what we
 	 * do in cs rollback
 	 */
-	device_kill_open_processes(hdev);
+	rc = device_kill_open_processes(hdev);
+	if (rc)
+		dev_crit(hdev->dev, "Failed to kill all open processes\n");
 
 	hl_cb_pool_fini(hdev);
 

From 8ff5f4fd40df9525675ea0e512da4cec65d646eb Mon Sep 17 00:00:00 2001
From: Omer Shpigelman <oshpigelman@habana.ai>
Date: Sun, 24 May 2020 23:06:59 +0300
Subject: [PATCH 6/6] habanalabs: handle MMU cache invalidation timeout

MMU cache invalidation timeout indicates that the device is unstable and
therefore unusable.
Hence in such case do hard reset and return an error to the user if was
called from ioctl.
In addition, change the print to error level and rephrase its text.

Signed-off-by: Omer Shpigelman <oshpigelman@habana.ai>
Reviewed-by: Oded Gabbay <oded.gabbay@gmail.com>
Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
---
 drivers/misc/habanalabs/gaudi/gaudi.c | 36 ++++++++++++++++-----------
 drivers/misc/habanalabs/goya/goya.c   | 34 +++++++++++++++----------
 drivers/misc/habanalabs/habanalabs.h  |  8 +++---
 drivers/misc/habanalabs/memory.c      | 35 ++++++++++++++++++++------
 4 files changed, 75 insertions(+), 38 deletions(-)

diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 92a5130f06fbb..61f88e9884ce9 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -5975,7 +5975,7 @@ static void *gaudi_get_events_stat(struct hl_device *hdev, bool aggregate,
 	return gaudi->events_stat;
 }
 
-static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 					u32 flags)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
@@ -5984,15 +5984,15 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 
 	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
-
-	mutex_lock(&hdev->mmu_cache_lock);
+		return 0;
 
 	if (hdev->pldm)
 		timeout_usec = GAUDI_PLDM_MMU_TIMEOUT_USEC;
 	else
 		timeout_usec = MMU_CONFIG_TIMEOUT_USEC;
 
+	mutex_lock(&hdev->mmu_cache_lock);
+
 	/* L0 & L1 invalidation */
 	WREG32(mmSTLB_INV_PS, 2);
 
@@ -6006,14 +6006,18 @@ static void gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 
 	WREG32(mmSTLB_INV_SET, 0);
 
-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
-
 	mutex_unlock(&hdev->mmu_cache_lock);
+
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+					"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }
 
-static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
+static int gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
 				bool is_hard, u32 asid, u64 va, u64 size)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
@@ -6024,7 +6028,7 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
 
 	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
+		return 0;
 
 	mutex_lock(&hdev->mmu_cache_lock);
 
@@ -6055,11 +6059,15 @@ static void gaudi_mmu_invalidate_cache_range(struct hl_device *hdev,
 		1000,
 		timeout_usec);
 
-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
-
 	mutex_unlock(&hdev->mmu_cache_lock);
+
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+					"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }
 
 static int gaudi_mmu_update_asid_hop0_addr(struct hl_device *hdev,
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 152418dfe20c3..0d2952bb58dfb 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -4884,7 +4884,7 @@ static void goya_mmu_prepare(struct hl_device *hdev, u32 asid)
 		goya_mmu_prepare_reg(hdev, goya_mmu_regs[i], asid);
 }
 
-static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
+static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 					u32 flags)
 {
 	struct goya_device *goya = hdev->asic_specific;
@@ -4893,11 +4893,11 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 
 	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
+		return 0;
 
 	/* no need in L1 only invalidation in Goya */
 	if (!is_hard)
-		return;
+		return 0;
 
 	if (hdev->pldm)
 		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@@ -4919,13 +4919,17 @@ static void goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 
 	mutex_unlock(&hdev->mmu_cache_lock);
 
-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+					"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }
 
-static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
-		bool is_hard, u32 asid, u64 va, u64 size)
+static int goya_mmu_invalidate_cache_range(struct hl_device *hdev,
+				bool is_hard, u32 asid, u64 va, u64 size)
 {
 	struct goya_device *goya = hdev->asic_specific;
 	u32 status, timeout_usec, inv_data, pi;
@@ -4933,11 +4937,11 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
 
 	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
 		hdev->hard_reset_pending)
-		return;
+		return 0;
 
 	/* no need in L1 only invalidation in Goya */
 	if (!is_hard)
-		return;
+		return 0;
 
 	if (hdev->pldm)
 		timeout_usec = GOYA_PLDM_MMU_TIMEOUT_USEC;
@@ -4970,9 +4974,13 @@ static void goya_mmu_invalidate_cache_range(struct hl_device *hdev,
 
 	mutex_unlock(&hdev->mmu_cache_lock);
 
-	if (rc)
-		dev_notice_ratelimited(hdev->dev,
-			"Timeout when waiting for MMU cache invalidation\n");
+	if (rc) {
+		dev_err_ratelimited(hdev->dev,
+					"MMU cache invalidation timeout\n");
+		hl_device_reset(hdev, true, false);
+	}
+
+	return rc;
 }
 
 int goya_send_heartbeat(struct hl_device *hdev)
diff --git a/drivers/misc/habanalabs/habanalabs.h b/drivers/misc/habanalabs/habanalabs.h
index 0f0691875298d..1ecdcf8b763a2 100644
--- a/drivers/misc/habanalabs/habanalabs.h
+++ b/drivers/misc/habanalabs/habanalabs.h
@@ -675,9 +675,9 @@ struct hl_asic_funcs {
 				u32 *size);
 	u64 (*read_pte)(struct hl_device *hdev, u64 addr);
 	void (*write_pte)(struct hl_device *hdev, u64 addr, u64 val);
-	void (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
+	int (*mmu_invalidate_cache)(struct hl_device *hdev, bool is_hard,
 					u32 flags);
-	void (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
+	int (*mmu_invalidate_cache_range)(struct hl_device *hdev, bool is_hard,
 			u32 asid, u64 va, u64 size);
 	int (*send_heartbeat)(struct hl_device *hdev);
 	void (*enable_clock_gating)(struct hl_device *hdev);
@@ -755,8 +755,8 @@ struct hl_va_range {
  *                      with huge pages.
  * @dram_va_range: holds available virtual addresses for DRAM mappings.
  * @mem_hash_lock: protects the mem_hash.
- * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifing the
- *            MMU hash or walking the PGT requires talking this lock
+ * @mmu_lock: protects the MMU page tables. Any change to the PGT, modifying the
+ *            MMU hash or walking the PGT requires talking this lock.
  * @debugfs_list: node in debugfs list of contexts.
  * @cs_sequence: sequence number for CS. Value is assigned to a CS and passed
  *			to user so user could inquire about CS. It is used as
diff --git a/drivers/misc/habanalabs/memory.c b/drivers/misc/habanalabs/memory.c
index a72f766ca4701..4b8eed1ca5130 100644
--- a/drivers/misc/habanalabs/memory.c
+++ b/drivers/misc/habanalabs/memory.c
@@ -886,6 +886,7 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 
 		vm_type = (enum vm_type_t *) userptr;
 		hint_addr = args->map_host.hint_addr;
+		handle = phys_pg_pack->handle;
 	} else {
 		handle = lower_32_bits(args->map_device.handle);
 
@@ -954,10 +955,17 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		goto map_err;
 	}
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
+	rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, false, *vm_type);
 
 	mutex_unlock(&ctx->mmu_lock);
 
+	if (rc) {
+		dev_err(hdev->dev,
+			"mapping handle %u failed due to MMU cache invalidation\n",
+			handle);
+		goto map_err;
+	}
+
 	ret_vaddr += phys_pg_pack->offset;
 
 	hnode->ptr = vm_type;
@@ -1083,21 +1091,34 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 	 * at the loop end rather than for each iteration
 	 */
 	if (!ctx_free)
-		hdev->asic_funcs->mmu_invalidate_cache(hdev, true, *vm_type);
+		rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
+								*vm_type);
 
 	mutex_unlock(&ctx->mmu_lock);
 
 	/*
-	 * No point in maintaining the free VA block list if the context is
-	 * closing as the list will be freed anyway
+	 * If the context is closing we don't need to check for the MMU cache
+	 * invalidation return code and update the VA free list as in this flow
+	 * we invalidate the MMU cache outside of this unmap function and the VA
+	 * free list will be freed anyway.
 	 */
 	if (!ctx_free) {
-		rc = add_va_block(hdev, va_range, vaddr,
-					vaddr + phys_pg_pack->total_size - 1);
+		int tmp_rc;
+
 		if (rc)
+			dev_err(hdev->dev,
+				"unmapping vaddr 0x%llx failed due to MMU cache invalidation\n",
+				vaddr);
+
+		tmp_rc = add_va_block(hdev, va_range, vaddr,
+					vaddr + phys_pg_pack->total_size - 1);
+		if (tmp_rc) {
 			dev_warn(hdev->dev,
 					"add va block failed for vaddr: 0x%llx\n",
 					vaddr);
+			if (!rc)
+				rc = tmp_rc;
+		}
 	}
 
 	atomic_dec(&phys_pg_pack->mapping_cnt);
@@ -1108,7 +1129,7 @@ static int unmap_device_va(struct hl_ctx *ctx, u64 vaddr, bool ctx_free)
 		dma_unmap_host_va(hdev, userptr);
 	}
 
-	return 0;
+	return rc;
 
 mapping_cnt_err:
 	if (is_userptr)