diff options
author | Chris Wilson <chris@chris-wilson.co.uk> | 2020-01-28 20:43:15 +0000 |
---|---|---|
committer | Chris Wilson <chris@chris-wilson.co.uk> | 2020-01-29 15:16:52 +0000 |
commit | 70a76a9b8e9d553c02deaf8503cd01a316016be0 (patch) | |
tree | e547bf99e3a9f96c0739357696410aeac962b8be /drivers/gpu/drm/i915/gt/intel_lrc.c | |
parent | drm/i915/execlist: Mark up racy read of execlists->pending[0] (diff) | |
download | wireguard-linux-70a76a9b8e9d553c02deaf8503cd01a316016be0.tar.xz wireguard-linux-70a76a9b8e9d553c02deaf8503cd01a316016be0.zip |
drm/i915/gt: Hook up CS_MASTER_ERROR_INTERRUPT
Now that we have offline error capture and can reset an engine from
inside an atomic context while also preserving the GPU state for
post-mortem analysis, it is time to handle error interrupts thrown by
the command parser.
This provides a much, much faster mechanism for us to detect known
problems than using heartbeats/hangchecks, and also provides a mechanism
for when those are disabled. However, it is limited to problems the HW
can detect in the CS and so not a complete solution for detecting lockups.
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200128204318.4182039-2-chris@chris-wilson.co.uk
Diffstat (limited to 'drivers/gpu/drm/i915/gt/intel_lrc.c')
-rw-r--r-- | drivers/gpu/drm/i915/gt/intel_lrc.c | 81 |
1 files changed, 66 insertions, 15 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index 677efdc7c301..9eabaee93b02 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -2613,13 +2613,13 @@ static bool execlists_capture(struct intel_engine_cs *engine) if (!cap) return true; + spin_lock_irq(&engine->active.lock); cap->rq = execlists_active(&engine->execlists); - GEM_BUG_ON(!cap->rq); - - rcu_read_lock(); - cap->rq = active_request(cap->rq->context->timeline, cap->rq); - cap->rq = i915_request_get_rcu(cap->rq); - rcu_read_unlock(); + if (cap->rq) { + cap->rq = active_request(cap->rq->context->timeline, cap->rq); + cap->rq = i915_request_get_rcu(cap->rq); + } + spin_unlock_irq(&engine->active.lock); if (!cap->rq) goto err_free; @@ -2658,27 +2658,25 @@ err_free: return false; } -static noinline void preempt_reset(struct intel_engine_cs *engine) +static void execlists_reset(struct intel_engine_cs *engine, const char *msg) { const unsigned int bit = I915_RESET_ENGINE + engine->id; unsigned long *lock = &engine->gt->reset.flags; - if (i915_modparams.reset < 3) + if (!intel_has_reset_engine(engine->gt)) return; if (test_and_set_bit(bit, lock)) return; + ENGINE_TRACE(engine, "reset for %s\n", msg); + /* Mark this tasklet as disabled to avoid waiting for it to complete */ tasklet_disable_nosync(&engine->execlists.tasklet); - ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n", - READ_ONCE(engine->props.preempt_timeout_ms), - jiffies_to_msecs(jiffies - engine->execlists.preempt.expires)); - ring_set_paused(engine, 1); /* Freeze the current request in place */ if (execlists_capture(engine)) - intel_engine_reset(engine, "preemption time out"); + intel_engine_reset(engine, msg); else ring_set_paused(engine, 0); @@ -2709,6 +2707,13 @@ static void execlists_submission_tasklet(unsigned long data) bool timeout = preempt_timeout(engine); process_csb(engine); + + if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) { + engine->execlists.error_interrupt = 0; + if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */ + execlists_reset(engine, "CS error"); + } + if (!READ_ONCE(engine->execlists.pending[0]) || timeout) { unsigned long flags; @@ -2717,8 +2722,8 @@ static void execlists_submission_tasklet(unsigned long data) spin_unlock_irqrestore(&engine->active.lock, flags); /* Recheck after serialising with direct-submission */ - if (timeout && preempt_timeout(engine)) - preempt_reset(engine); + if (unlikely(timeout && preempt_timeout(engine))) + execlists_reset(engine, "preemption time out"); } } @@ -3335,6 +3340,49 @@ static int intel_init_workaround_bb(struct intel_engine_cs *engine) return ret; } +static void enable_error_interrupt(struct intel_engine_cs *engine) +{ + u32 status; + + engine->execlists.error_interrupt = 0; + ENGINE_WRITE(engine, RING_EMR, ~0u); + ENGINE_WRITE(engine, RING_EIR, ~0u); /* clear all existing errors */ + + status = ENGINE_READ(engine, RING_ESR); + if (unlikely(status)) { + dev_err(engine->i915->drm.dev, + "engine '%s' resumed still in error: %08x\n", + engine->name, status); + __intel_gt_reset(engine->gt, engine->mask); + } + + /* + * On current gen8+, we have 2 signals to play with + * + * - I915_ERROR_INSTUCTION (bit 0) + * + * Generate an error if the command parser encounters an invalid + * instruction + * + * This is a fatal error. + * + * - CP_PRIV (bit 2) + * + * Generate an error on privilege violation (where the CP replaces + * the instruction with a no-op). This also fires for writes into + * read-only scratch pages. + * + * This is a non-fatal error, parsing continues. + * + * * there are a few others defined for odd HW that we do not use + * + * Since CP_PRIV fires for cases where we have chosen to ignore the + * error (as the HW is validating and suppressing the mistakes), we + * only unmask the instruction error bit. + */ + ENGINE_WRITE(engine, RING_EMR, ~I915_ERROR_INSTRUCTION); +} + static void enable_execlists(struct intel_engine_cs *engine) { u32 mode; @@ -3356,6 +3404,8 @@ static void enable_execlists(struct intel_engine_cs *engine) i915_ggtt_offset(engine->status_page.vma)); ENGINE_POSTING_READ(engine, RING_HWS_PGA); + enable_error_interrupt(engine); + engine->context_tag = 0; } @@ -4282,6 +4332,7 @@ logical_ring_default_irqs(struct intel_engine_cs *engine) engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift; engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift; + engine->irq_keep_mask |= GT_CS_MASTER_ERROR_INTERRUPT << shift; } static void rcs_submission_override(struct intel_engine_cs *engine) |