drm/i915: Remove GPU reset dependence on struct_mutex

Now that the submission backends are controlled via their own spinlocks, with a wave of a magic wand we can lift the struct_mutex requirement around GPU reset. That is we allow the submission frontend (userspace) to keep on submitting while we process the GPU reset as we can suspend the backend independently. The major change is around the backoff/handoff strategy for performing the reset. With no mutex deadlock, we no longer have to coordinate with any waiter, and just perform the reset immediately. Testcase: igt/gem_mmap_gtt/hang # regresses Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20190125132230.22221-3-chris@chris-wilson.co.uk
mariux64 · Jan 25, 2019 · eb8d0f5 · eb8d0f5
1 parent fe62365
commit eb8d0f5
Showing 20 changed files with 404 additions and 537 deletions.
diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -1284,8 +1284,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_puts(m, "Wedged\n");
 	if (test_bit(I915_RESET_BACKOFF, &dev_priv->gpu_error.flags))
 		seq_puts(m, "Reset in progress: struct_mutex backoff\n");
-	if (test_bit(I915_RESET_HANDOFF, &dev_priv->gpu_error.flags))
-		seq_puts(m, "Reset in progress: reset handoff to waiter\n");
 	if (waitqueue_active(&dev_priv->gpu_error.wait_queue))
 		seq_puts(m, "Waiter holding struct mutex\n");
 	if (waitqueue_active(&dev_priv->gpu_error.reset_queue))
@@ -1321,15 +1319,15 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		struct rb_node *rb;
 
 		seq_printf(m, "%s:\n", engine->name);
-		seq_printf(m, "\tseqno = %x [current %x, last %x]\n",
+		seq_printf(m, "\tseqno = %x [current %x, last %x], %dms ago\n",
 			   engine->hangcheck.seqno, seqno[id],
-			   intel_engine_last_submit(engine));
-		seq_printf(m, "\twaiters? %s, fake irq active? %s, stalled? %s, wedged? %s\n",
+			   intel_engine_last_submit(engine),
+			   jiffies_to_msecs(jiffies -
+					    engine->hangcheck.action_timestamp));
+		seq_printf(m, "\twaiters? %s, fake irq active? %s\n",
 			   yesno(intel_engine_has_waiter(engine)),
 			   yesno(test_bit(engine->id,
-					  &dev_priv->gpu_error.missed_irq_rings)),
-			   yesno(engine->hangcheck.stalled),
-			   yesno(engine->hangcheck.wedged));
+					  &dev_priv->gpu_error.missed_irq_rings)));
 
 		spin_lock_irq(&b->rb_lock);
 		for (rb = rb_first(&b->waiters); rb; rb = rb_next(rb)) {
@@ -1343,11 +1341,6 @@ static int i915_hangcheck_info(struct seq_file *m, void *unused)
 		seq_printf(m, "\tACTHD = 0x%08llx [current 0x%08llx]\n",
 			   (long long)engine->hangcheck.acthd,
 			   (long long)acthd[id]);
-		seq_printf(m, "\taction = %s(%d) %d ms ago\n",
-			   hangcheck_action_to_str(engine->hangcheck.action),
-			   engine->hangcheck.action,
-			   jiffies_to_msecs(jiffies -
-					    engine->hangcheck.action_timestamp));
 
 		if (engine->id == RCS) {
 			seq_puts(m, "\tinstdone read =\n");
@@ -3911,8 +3904,6 @@ static int
 i915_wedged_set(void *data, u64 val)
 {
 	struct drm_i915_private *i915 = data;
-	struct intel_engine_cs *engine;
-	unsigned int tmp;
 
 	/*
 	 * There is no safeguard against this debugfs entry colliding
@@ -3925,18 +3916,8 @@ i915_wedged_set(void *data, u64 val)
 	if (i915_reset_backoff(&i915->gpu_error))
 		return -EAGAIN;
 
-	for_each_engine_masked(engine, i915, val, tmp) {
-		engine->hangcheck.seqno = intel_engine_get_seqno(engine);
-		engine->hangcheck.stalled = true;
-	}
-
 	i915_handle_error(i915, val, I915_ERROR_CAPTURE,
 			  "Manually set wedged engine mask = %llx", val);
-
-	wait_on_bit(&i915->gpu_error.flags,
-		    I915_RESET_HANDOFF,
-		    TASK_UNINTERRUPTIBLE);
-
 	return 0;
 }
 
@@ -4091,13 +4072,8 @@ i915_drop_caches_set(void *data, u64 val)
 		mutex_unlock(&i915->drm.struct_mutex);
 	}
 
-	if (val & DROP_RESET_ACTIVE &&
-	    i915_terminally_wedged(&i915->gpu_error)) {
+	if (val & DROP_RESET_ACTIVE && i915_terminally_wedged(&i915->gpu_error))
 		i915_handle_error(i915, ALL_ENGINES, 0, NULL);
-		wait_on_bit(&i915->gpu_error.flags,
-			    I915_RESET_HANDOFF,
-			    TASK_UNINTERRUPTIBLE);
-	}
 
 	fs_reclaim_acquire(GFP_KERNEL);
 	if (val & DROP_BOUND)

diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
@@ -3001,11 +3001,6 @@ static inline bool i915_reset_backoff(struct i915_gpu_error *error)
 	return unlikely(test_bit(I915_RESET_BACKOFF, &error->flags));
 }
 
-static inline bool i915_reset_handoff(struct i915_gpu_error *error)
-{
-	return unlikely(test_bit(I915_RESET_HANDOFF, &error->flags));
-}
-
 static inline bool i915_terminally_wedged(struct i915_gpu_error *error)
 {
 	return unlikely(test_bit(I915_WEDGED, &error->flags));

diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c
@@ -657,11 +657,6 @@ i915_gem_object_wait(struct drm_i915_gem_object *obj,
 		     struct intel_rps_client *rps_client)
 {
 	might_sleep();
-#if IS_ENABLED(CONFIG_LOCKDEP)
-	GEM_BUG_ON(debug_locks &&
-		   !!lockdep_is_held(&obj->base.dev->struct_mutex) !=
-		   !!(flags & I915_WAIT_LOCKED));
-#endif
 	GEM_BUG_ON(timeout < 0);
 
 	timeout = i915_gem_object_wait_reservation(obj->resv,
@@ -4493,8 +4488,6 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 
 	GEM_TRACE("\n");
 
-	mutex_lock(&i915->drm.struct_mutex);
-
 	wakeref = intel_runtime_pm_get(i915);
 	intel_uncore_forcewake_get(i915, FORCEWAKE_ALL);
 
@@ -4520,6 +4513,7 @@ void i915_gem_sanitize(struct drm_i915_private *i915)
 	intel_uncore_forcewake_put(i915, FORCEWAKE_ALL);
 	intel_runtime_pm_put(i915, wakeref);
 
+	mutex_lock(&i915->drm.struct_mutex);
 	i915_gem_contexts_lost(i915);
 	mutex_unlock(&i915->drm.struct_mutex);
 }
@@ -4534,6 +4528,8 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	wakeref = intel_runtime_pm_get(i915);
 	intel_suspend_gt_powersave(i915);
 
+	flush_workqueue(i915->wq);
+
 	mutex_lock(&i915->drm.struct_mutex);
 
 	/*
@@ -4563,18 +4559,18 @@ int i915_gem_suspend(struct drm_i915_private *i915)
 	i915_retire_requests(i915); /* ensure we flush after wedging */
 
 	mutex_unlock(&i915->drm.struct_mutex);
+	i915_reset_flush(i915);
 
-	intel_uc_suspend(i915);
-
-	cancel_delayed_work_sync(&i915->gpu_error.hangcheck_work);
-	cancel_delayed_work_sync(&i915->gt.retire_work);
+	drain_delayed_work(&i915->gt.retire_work);
 
 	/*
 	 * As the idle_work is rearming if it detects a race, play safe and
 	 * repeat the flush until it is definitely idle.
 	 */
 	drain_delayed_work(&i915->gt.idle_work);
 
+	intel_uc_suspend(i915);
+
 	/*
 	 * Assert that we successfully flushed all the work and
 	 * reset the GPU back to its idle, low power state.

diff --git a/drivers/gpu/drm/i915/i915_gem_fence_reg.h b/drivers/gpu/drm/i915/i915_gem_fence_reg.h
@@ -50,4 +50,3 @@ struct drm_i915_fence_reg {
 };
 
 #endif
-
diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.h b/drivers/gpu/drm/i915/i915_gem_gtt.h
@@ -39,6 +39,7 @@
 #include <linux/pagevec.h>
 
 #include "i915_request.h"
+#include "i915_reset.h"
 #include "i915_selftest.h"
 #include "i915_timeline.h"
 

diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -533,10 +533,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
 	err_printf(m, "  waiting: %s\n", yesno(ee->waiting));
 	err_printf(m, "  ring->head: 0x%08x\n", ee->cpu_ring_head);
 	err_printf(m, "  ring->tail: 0x%08x\n", ee->cpu_ring_tail);
-	err_printf(m, "  hangcheck stall: %s\n", yesno(ee->hangcheck_stalled));
-	err_printf(m, "  hangcheck action: %s\n",
-		   hangcheck_action_to_str(ee->hangcheck_action));
-	err_printf(m, "  hangcheck action timestamp: %dms (%lu%s)\n",
+	err_printf(m, "  hangcheck timestamp: %dms (%lu%s)\n",
 		   jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
 		   ee->hangcheck_timestamp,
 		   ee->hangcheck_timestamp == epoch ? "; epoch" : "");
@@ -684,15 +681,15 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
 		   jiffies_to_msecs(error->capture - error->epoch));
 
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
-		if (error->engine[i].hangcheck_stalled &&
-		    error->engine[i].context.pid) {
-			err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
-				   engine_name(m->i915, i),
-				   error->engine[i].context.comm,
-				   error->engine[i].context.pid,
-				   error->engine[i].context.ban_score,
-				   bannable(&error->engine[i].context));
-		}
+		if (!error->engine[i].context.pid)
+			continue;
+
+		err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
+			   engine_name(m->i915, i),
+			   error->engine[i].context.comm,
+			   error->engine[i].context.pid,
+			   error->engine[i].context.ban_score,
+			   bannable(&error->engine[i].context));
 	}
 	err_printf(m, "Reset count: %u\n", error->reset_count);
 	err_printf(m, "Suspend count: %u\n", error->suspend_count);
@@ -1144,7 +1141,8 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
 	return i;
 }
 
-/* Generate a semi-unique error code. The code is not meant to have meaning, The
+/*
+ * Generate a semi-unique error code. The code is not meant to have meaning, The
  * code's only purpose is to try to prevent false duplicated bug reports by
  * grossly estimating a GPU error state.
  *
@@ -1153,29 +1151,23 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
  *
  * It's only a small step better than a random number in its current form.
  */
-static u32 i915_error_generate_code(struct drm_i915_private *dev_priv,
-				    struct i915_gpu_state *error,
-				    int *engine_id)
+static u32 i915_error_generate_code(struct i915_gpu_state *error,
+				    unsigned long engine_mask)
 {
-	u32 error_code = 0;
-	int i;
-
-	/* IPEHR would be an ideal way to detect errors, as it's the gross
+	/*
+	 * IPEHR would be an ideal way to detect errors, as it's the gross
 	 * measure of "the command that hung." However, has some very common
 	 * synchronization commands which almost always appear in the case
 	 * strictly a client bug. Use instdone to differentiate those some.
 	 */
-	for (i = 0; i < I915_NUM_ENGINES; i++) {
-		if (error->engine[i].hangcheck_stalled) {
-			if (engine_id)
-				*engine_id = i;
+	if (engine_mask) {
+		struct drm_i915_error_engine *ee =
+			&error->engine[ffs(engine_mask)];
 
-			return error->engine[i].ipehr ^
-			       error->engine[i].instdone.instdone;
-		}
+		return ee->ipehr ^ ee->instdone.instdone;
 	}
 
-	return error_code;
+	return 0;
 }
 
 static void gem_record_fences(struct i915_gpu_state *error)
@@ -1338,9 +1330,8 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
 	}
 
 	ee->idle = intel_engine_is_idle(engine);
-	ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
-	ee->hangcheck_action = engine->hangcheck.action;
-	ee->hangcheck_stalled = engine->hangcheck.stalled;
+	if (!ee->idle)
+		ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
 	ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
 						  engine);
 
@@ -1783,31 +1774,35 @@ static void capture_reg_state(struct i915_gpu_state *error)
 	error->pgtbl_er = I915_READ(PGTBL_ER);
 }
 
-static void i915_error_capture_msg(struct drm_i915_private *dev_priv,
-				   struct i915_gpu_state *error,
-				   u32 engine_mask,
-				   const char *error_msg)
+static const char *
+error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
 {
-	u32 ecode;
-	int engine_id = -1, len;
+	int len;
+	int i;
 
-	ecode = i915_error_generate_code(dev_priv, error, &engine_id);
+	for (i = 0; i < ARRAY_SIZE(error->engine); i++)
+		if (!error->engine[i].context.pid)
+			engines &= ~BIT(i);
 
 	len = scnprintf(error->error_msg, sizeof(error->error_msg),
-			"GPU HANG: ecode %d:%d:0x%08x",
-			INTEL_GEN(dev_priv), engine_id, ecode);
-
-	if (engine_id != -1 && error->engine[engine_id].context.pid)
+			"GPU HANG: ecode %d:%lx:0x%08x",
+			INTEL_GEN(error->i915), engines,
+			i915_error_generate_code(error, engines));
+	if (engines) {
+		/* Just show the first executing process, more is confusing */
+		i = ffs(engines);
 		len += scnprintf(error->error_msg + len,
 				 sizeof(error->error_msg) - len,
 				 ", in %s [%d]",
-				 error->engine[engine_id].context.comm,
-				 error->engine[engine_id].context.pid);
+				 error->engine[i].context.comm,
+				 error->engine[i].context.pid);
+	}
+	if (msg)
+		len += scnprintf(error->error_msg + len,
+				 sizeof(error->error_msg) - len,
+				 ", %s", msg);
 
-	scnprintf(error->error_msg + len, sizeof(error->error_msg) - len,
-		  ", reason: %s, action: %s",
-		  error_msg,
-		  engine_mask ? "reset" : "continue");
+	return error->error_msg;
 }
 
 static void capture_gen_state(struct i915_gpu_state *error)
@@ -1847,7 +1842,7 @@ static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
 	for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
 		const struct drm_i915_error_engine *ee = &error->engine[i];
 
-		if (ee->hangcheck_stalled &&
+		if (ee->hangcheck_timestamp &&
 		    time_before(ee->hangcheck_timestamp, epoch))
 			epoch = ee->hangcheck_timestamp;
 	}
@@ -1921,16 +1916,16 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
  * i915_capture_error_state - capture an error record for later analysis
  * @i915: i915 device
  * @engine_mask: the mask of engines triggering the hang
- * @error_msg: a message to insert into the error capture header
+ * @msg: a message to insert into the error capture header
  *
  * Should be called when an error is detected (either a hang or an error
  * interrupt) to capture error state from the time of the error.  Fills
  * out a structure which becomes available in debugfs for user level tools
  * to pick up.
  */
 void i915_capture_error_state(struct drm_i915_private *i915,
-			      u32 engine_mask,
-			      const char *error_msg)
+			      unsigned long engine_mask,
+			      const char *msg)
 {
 	static bool warned;
 	struct i915_gpu_state *error;
@@ -1946,8 +1941,7 @@ void i915_capture_error_state(struct drm_i915_private *i915,
 	if (IS_ERR(error))
 		return;
 
-	i915_error_capture_msg(i915, error, engine_mask, error_msg);
-	DRM_INFO("%s\n", error->error_msg);
+	dev_info(i915->drm.dev, "%s\n", error_msg(error, engine_mask, msg));
 
 	if (!error->simulated) {
 		spin_lock_irqsave(&i915->gpu_error.lock, flags);