aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/drivers/gpu/drm/i915/gt/intel_lrc.c
diff options
context:
space:
mode:
authorChris Wilson <chris@chris-wilson.co.uk>2020-01-28 20:43:15 +0000
committerChris Wilson <chris@chris-wilson.co.uk>2020-01-29 15:16:52 +0000
commit70a76a9b8e9d553c02deaf8503cd01a316016be0 (patch)
treee547bf99e3a9f96c0739357696410aeac962b8be /drivers/gpu/drm/i915/gt/intel_lrc.c
parentdrm/i915/execlist: Mark up racy read of execlists->pending[0] (diff)
downloadwireguard-linux-70a76a9b8e9d553c02deaf8503cd01a316016be0.tar.xz
wireguard-linux-70a76a9b8e9d553c02deaf8503cd01a316016be0.zip
drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT
Now that we have offline error capture and can reset an engine from inside an atomic context while also preserving the GPU state for post-mortem analysis, it is time to handle error interrupts thrown by the command parser. This provides a much, much faster mechanism for us to detect known problems than using heartbeats/hangchecks, and also provides a mechanism for when those are disabled. However, it is limited to problems the HW can detect in the CS and so not a complete solution for detecting lockups. Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com> Link: https://patchwork.freedesktop.org/patch/msgid/20200128204318.4182039-2-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_lrc.c')
-rw-r--r--drivers/gpu/drm/i915/gt/intel_lrc.c81
1 files changed, 66 insertions, 15 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c
index 677efdc7c301..9eabaee93b02 100644
--- a/drivers/gpu/drm/i915/gt/intel_lrc.c
+++ b/drivers/gpu/drm/i915/gt/intel_lrc.c
@@ -2613,13 +2613,13 @@ static bool execlists_capture(struct intel_engine_cs *engine)
if (!cap)
return true;
+ spin_lock_irq(&engine->active.lock);
cap->rq = execlists_active(&engine->execlists);
- GEM_BUG_ON(!cap->rq);
-
- rcu_read_lock();
- cap->rq = active_request(cap->rq->context->timeline, cap->rq);
- cap->rq = i915_request_get_rcu(cap->rq);
- rcu_read_unlock();
+ if (cap->rq) {
+ cap->rq = active_request(cap->rq->context->timeline, cap->rq);
+ cap->rq = i915_request_get_rcu(cap->rq);
+ }
+ spin_unlock_irq(&engine->active.lock);
if (!cap->rq)
goto err_free;
@@ -2658,27 +2658,25 @@ err_free:
return false;
}
-static noinline void preempt_reset(struct intel_engine_cs *engine)
+static void execlists_reset(struct intel_engine_cs *engine, const char *msg)
{
const unsigned int bit = I915_RESET_ENGINE + engine->id;
unsigned long *lock = &engine->gt->reset.flags;
- if (i915_modparams.reset < 3)
+ if (!intel_has_reset_engine(engine->gt))
return;
if (test_and_set_bit(bit, lock))
return;
+ ENGINE_TRACE(engine, "reset for %s\n", msg);
+
/* Mark this tasklet as disabled to avoid waiting for it to complete */
tasklet_disable_nosync(&engine->execlists.tasklet);
- ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
- READ_ONCE(engine->props.preempt_timeout_ms),
- jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
-
ring_set_paused(engine, 1); /* Freeze the current request in place */
if (execlists_capture(engine))
- intel_engine_reset(engine, "preemption time out");
+ intel_engine_reset(engine, msg);
else
ring_set_paused(engine, 0);
@@ -2709,6 +2707,13 @@ static void execlists_submission_tasklet(unsigned long data)
bool timeout = preempt_timeout(engine);
process_csb(engine);
+
+ if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
+ engine->execlists.error_interrupt = 0;
+ if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
+ execlists_reset(engine, "CS error");
+ }
+
if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
unsigned long flags;
@@ -2717,8 +2722,8 @@ static void execlists_submission_tasklet(unsigned long data)
spin_unlock_irqrestore(&engine->active.lock, flags);
/* Recheck after serialising with direct-submission */
- if (timeout && preempt_timeout(engine))
- preempt_reset(engine);
+ if (unlikely(timeout && preempt_timeout(engine)))
+ execlists_reset(engine, "preemption time out");
}
}
@@ -3335,6 +3340,49 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine)
return ret;
}
+static void enable_error_interrupt(struct intel_engine_cs *engine)
+{
+ u32 status;
+
+ engine->execlists.error_interrupt = 0;
+ ENGINE_WRITE(engine, RING_EMR, ~0u);
+ ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */
+
+ status = ENGINE_READ(engine, RING_ESR);
+ if (unlikely(status)) {
+ dev_err(engine->i915->drm.dev,
+ "engine '%s' resumed still in error: %08x\n",
+ engine->name, status);
+ __intel_gt_reset(engine->gt, engine->mask);
+ }
+
+ /*
+ * On current gen8+, we have 2 signals to play with
+ *
+ * - I915_ERROR_INSTUCTION (bit 0)
+ *
+ * Generate an error if the command parser encounters an invalid
+ * instruction
+ *
+ * This is a fatal error.
+ *
+ * - CP_PRIV (bit 2)
+ *
+ * Generate an error on privilege violation (where the CP replaces
+ * the instruction with a no-op). This also fires for writes into
+ * read-only scratch pages.
+ *
+ * This is a non-fatal error, parsing continues.
+ *
+ * * there are a few others defined for odd HW that we do not use
+ *
+ * Since CP_PRIV fires for cases where we have chosen to ignore the
+ * error (as the HW is validating and suppressing the mistakes), we
+ * only unmask the instruction error bit.
+ */
+ ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION);
+}
+
static void enable_execlists(struct intel_engine_cs *engine)
{
u32 mode;
@@ -3356,6 +3404,8 @@ static void enable_execlists(struct intel_engine_cs *engine)
i915_ggtt_offset(engine->status_page.vma));
ENGINE_POSTING_READ(engine, RING_HWS_PGA);
+ enable_error_interrupt(engine);
+
engine->context_tag = 0;
}
@@ -4282,6 +4332,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine)
engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
+ engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift;
}
static void rcs_submission_override(struct intel_engine_cs *engine)