aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/selftests/intel_hangcheck.c')
-rw-r--r--drivers/gpu/drm/i915/selftests/intel_hangcheck.c417
1 files changed, 304 insertions, 113 deletions
diff --git a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
index 40efbed611de..7b6f3bea9ef8 100644
--- a/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
+++ b/drivers/gpu/drm/i915/selftests/intel_hangcheck.c
@@ -103,52 +103,87 @@ static u64 hws_address(const struct i915_vma *hws,
return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}
-static int emit_recurse_batch(struct hang *h,
- struct i915_request *rq)
+static int move_to_active(struct i915_vma *vma,
+ struct i915_request *rq,
+ unsigned int flags)
+{
+ int err;
+
+ err = i915_vma_move_to_active(vma, rq, flags);
+ if (err)
+ return err;
+
+ if (!i915_gem_object_has_active_reference(vma->obj)) {
+ i915_gem_object_get(vma->obj);
+ i915_gem_object_set_active_reference(vma->obj);
+ }
+
+ return 0;
+}
+
+static struct i915_request *
+hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
struct drm_i915_private *i915 = h->i915;
struct i915_address_space *vm =
- rq->gem_context->ppgtt ?
- &rq->gem_context->ppgtt->vm :
- &i915->ggtt.vm;
+ h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm;
+ struct i915_request *rq = NULL;
struct i915_vma *hws, *vma;
unsigned int flags;
u32 *batch;
int err;
+ if (i915_gem_object_is_active(h->obj)) {
+ struct drm_i915_gem_object *obj;
+ void *vaddr;
+
+ obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
+ if (IS_ERR(obj))
+ return ERR_CAST(obj);
+
+ vaddr = i915_gem_object_pin_map(obj,
+ i915_coherent_map_type(h->i915));
+ if (IS_ERR(vaddr)) {
+ i915_gem_object_put(obj);
+ return ERR_CAST(vaddr);
+ }
+
+ i915_gem_object_unpin_map(h->obj);
+ i915_gem_object_put(h->obj);
+
+ h->obj = obj;
+ h->batch = vaddr;
+ }
+
vma = i915_vma_instance(h->obj, vm, NULL);
if (IS_ERR(vma))
- return PTR_ERR(vma);
+ return ERR_CAST(vma);
hws = i915_vma_instance(h->hws, vm, NULL);
if (IS_ERR(hws))
- return PTR_ERR(hws);
+ return ERR_CAST(hws);
err = i915_vma_pin(vma, 0, 0, PIN_USER);
if (err)
- return err;
+ return ERR_PTR(err);
err = i915_vma_pin(hws, 0, 0, PIN_USER);
if (err)
goto unpin_vma;
- err = i915_vma_move_to_active(vma, rq, 0);
- if (err)
+ rq = i915_request_alloc(engine, h->ctx);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
goto unpin_hws;
-
- if (!i915_gem_object_has_active_reference(vma->obj)) {
- i915_gem_object_get(vma->obj);
- i915_gem_object_set_active_reference(vma->obj);
}
- err = i915_vma_move_to_active(hws, rq, 0);
+ err = move_to_active(vma, rq, 0);
if (err)
- goto unpin_hws;
+ goto cancel_rq;
- if (!i915_gem_object_has_active_reference(hws->obj)) {
- i915_gem_object_get(hws->obj);
- i915_gem_object_set_active_reference(hws->obj);
- }
+ err = move_to_active(hws, rq, 0);
+ if (err)
+ goto cancel_rq;
batch = h->batch;
if (INTEL_GEN(i915) >= 8) {
@@ -213,52 +248,16 @@ static int emit_recurse_batch(struct hang *h,
err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
+cancel_rq:
+ if (err) {
+ i915_request_skip(rq, err);
+ i915_request_add(rq);
+ }
unpin_hws:
i915_vma_unpin(hws);
unpin_vma:
i915_vma_unpin(vma);
- return err;
-}
-
-static struct i915_request *
-hang_create_request(struct hang *h, struct intel_engine_cs *engine)
-{
- struct i915_request *rq;
- int err;
-
- if (i915_gem_object_is_active(h->obj)) {
- struct drm_i915_gem_object *obj;
- void *vaddr;
-
- obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
- if (IS_ERR(obj))
- return ERR_CAST(obj);
-
- vaddr = i915_gem_object_pin_map(obj,
- i915_coherent_map_type(h->i915));
- if (IS_ERR(vaddr)) {
- i915_gem_object_put(obj);
- return ERR_CAST(vaddr);
- }
-
- i915_gem_object_unpin_map(h->obj);
- i915_gem_object_put(h->obj);
-
- h->obj = obj;
- h->batch = vaddr;
- }
-
- rq = i915_request_alloc(engine, h->ctx);
- if (IS_ERR(rq))
- return rq;
-
- err = emit_recurse_batch(h, rq);
- if (err) {
- i915_request_add(rq);
- return ERR_PTR(err);
- }
-
- return rq;
+ return err ? ERR_PTR(err) : rq;
}
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
@@ -364,9 +363,7 @@ static int igt_global_reset(void *arg)
/* Check that we can issue a global GPU reset */
igt_global_reset_lock(i915);
- set_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags);
- mutex_lock(&i915->drm.struct_mutex);
reset_count = i915_reset_count(&i915->gpu_error);
i915_reset(i915, ALL_ENGINES, NULL);
@@ -375,9 +372,7 @@ static int igt_global_reset(void *arg)
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
}
- mutex_unlock(&i915->drm.struct_mutex);
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
igt_global_reset_unlock(i915);
if (i915_terminally_wedged(&i915->gpu_error))
@@ -386,6 +381,29 @@ static int igt_global_reset(void *arg)
return err;
}
+static int igt_wedged_reset(void *arg)
+{
+ struct drm_i915_private *i915 = arg;
+ intel_wakeref_t wakeref;
+
+ /* Check that we can recover a wedged device with a GPU reset */
+
+ igt_global_reset_lock(i915);
+ wakeref = intel_runtime_pm_get(i915);
+
+ i915_gem_set_wedged(i915);
+
+ mutex_lock(&i915->drm.struct_mutex);
+ GEM_BUG_ON(!i915_terminally_wedged(&i915->gpu_error));
+ i915_reset(i915, ALL_ENGINES, NULL);
+ mutex_unlock(&i915->drm.struct_mutex);
+
+ intel_runtime_pm_put(i915, wakeref);
+ igt_global_reset_unlock(i915);
+
+ return i915_terminally_wedged(&i915->gpu_error) ? -EIO : 0;
+}
+
static bool wait_for_idle(struct intel_engine_cs *engine)
{
return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
@@ -431,8 +449,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
do {
- u32 seqno = intel_engine_get_seqno(engine);
-
if (active) {
struct i915_request *rq;
@@ -451,7 +467,7 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
if (!wait_until_running(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s: Failed to start request %x, at %x\n",
+ pr_err("%s: Failed to start request %llx, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(engine, &p,
"%s\n", engine->name);
@@ -461,8 +477,6 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
- GEM_BUG_ON(!rq->global_seqno);
- seqno = rq->global_seqno - 1;
i915_request_put(rq);
}
@@ -478,16 +492,15 @@ static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
break;
}
- reset_engine_count += active;
if (i915_reset_engine_count(&i915->gpu_error, engine) !=
- reset_engine_count) {
- pr_err("%s engine reset %srecorded!\n",
- engine->name, active ? "not " : "");
+ ++reset_engine_count) {
+ pr_err("%s engine reset not recorded!\n",
+ engine->name);
err = -EINVAL;
break;
}
- if (!wait_for_idle(engine)) {
+ if (!i915_reset_flush(i915)) {
struct drm_printer p =
drm_info_printer(i915->drm.dev);
@@ -552,7 +565,7 @@ static int active_request_put(struct i915_request *rq)
return 0;
if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
- GEM_TRACE("%s timed out waiting for completion of fence %llx:%d, seqno %d.\n",
+ GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld, seqno %d.\n",
rq->engine->name,
rq->fence.context,
rq->fence.seqno,
@@ -710,7 +723,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
do {
- u32 seqno = intel_engine_get_seqno(engine);
struct i915_request *rq = NULL;
if (flags & TEST_ACTIVE) {
@@ -729,7 +741,7 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
if (!wait_until_running(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s: Failed to start request %x, at %x\n",
+ pr_err("%s: Failed to start request %llx, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(engine, &p,
"%s\n", engine->name);
@@ -738,9 +750,6 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
err = -EIO;
break;
}
-
- GEM_BUG_ON(!rq->global_seqno);
- seqno = rq->global_seqno - 1;
}
err = i915_reset_engine(engine, NULL);
@@ -777,10 +786,9 @@ static int __igt_reset_engines(struct drm_i915_private *i915,
reported = i915_reset_engine_count(&i915->gpu_error, engine);
reported -= threads[engine->id].resets;
- if (reported != (flags & TEST_ACTIVE ? count : 0)) {
- pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu, expected %lu reported\n",
- engine->name, test_name, count, reported,
- (flags & TEST_ACTIVE ? count : 0));
+ if (reported != count) {
+ pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
+ engine->name, test_name, count, reported);
if (!err)
err = -EINVAL;
}
@@ -879,20 +887,13 @@ static int igt_reset_engines(void *arg)
return 0;
}
-static u32 fake_hangcheck(struct i915_request *rq, u32 mask)
+static u32 fake_hangcheck(struct drm_i915_private *i915, u32 mask)
{
- struct i915_gpu_error *error = &rq->i915->gpu_error;
- u32 reset_count = i915_reset_count(error);
-
- error->stalled_mask = mask;
-
- /* set_bit() must be after we have setup the backchannel (mask) */
- smp_mb__before_atomic();
- set_bit(I915_RESET_HANDOFF, &error->flags);
+ u32 count = i915_reset_count(&i915->gpu_error);
- wake_up_all(&error->wait_queue);
+ i915_reset(i915, mask, NULL);
- return reset_count;
+ return count;
}
static int igt_reset_wait(void *arg)
@@ -928,7 +929,7 @@ static int igt_reset_wait(void *arg)
if (!wait_until_running(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s: Failed to start request %x, at %x\n",
+ pr_err("%s: Failed to start request %llx, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
@@ -938,7 +939,7 @@ static int igt_reset_wait(void *arg)
goto out_rq;
}
- reset_count = fake_hangcheck(rq, ALL_ENGINES);
+ reset_count = fake_hangcheck(i915, ALL_ENGINES);
timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
if (timeout < 0) {
@@ -948,7 +949,6 @@ static int igt_reset_wait(void *arg)
goto out_rq;
}
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF, &i915->gpu_error.flags));
if (i915_reset_count(&i915->gpu_error) == reset_count) {
pr_err("No GPU reset recorded!\n");
err = -EINVAL;
@@ -1107,7 +1107,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
if (!wait_until_running(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s: Failed to start request %x, at %x\n",
+ pr_err("%s: Failed to start request %llx, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
@@ -1127,7 +1127,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
wait_for_completion(&arg.completion);
- if (wait_for(waitqueue_active(&rq->execute), 10)) {
+ if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
pr_err("igt/evict_vma kthread did not wait\n");
@@ -1138,7 +1138,7 @@ static int __igt_reset_evict_vma(struct drm_i915_private *i915,
}
out_reset:
- fake_hangcheck(rq, intel_engine_flag(rq->engine));
+ fake_hangcheck(rq->i915, intel_engine_flag(rq->engine));
if (tsk) {
struct igt_wedge_me w;
@@ -1302,7 +1302,7 @@ static int igt_reset_queue(void *arg)
if (!wait_until_running(&h, prev)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s(%s): Failed to start request %x, at %x\n",
+ pr_err("%s(%s): Failed to start request %llx, at %x\n",
__func__, engine->name,
prev->fence.seqno, hws_seqno(&h, prev));
intel_engine_dump(engine, &p,
@@ -1317,12 +1317,7 @@ static int igt_reset_queue(void *arg)
goto fini;
}
- reset_count = fake_hangcheck(prev, ENGINE_MASK(id));
-
- i915_reset(i915, ENGINE_MASK(id), NULL);
-
- GEM_BUG_ON(test_bit(I915_RESET_HANDOFF,
- &i915->gpu_error.flags));
+ reset_count = fake_hangcheck(i915, ENGINE_MASK(id));
if (prev->fence.error != -EIO) {
pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
@@ -1413,7 +1408,7 @@ static int igt_handle_error(void *arg)
if (!wait_until_running(&h, rq)) {
struct drm_printer p = drm_info_printer(i915->drm.dev);
- pr_err("%s: Failed to start request %x, at %x\n",
+ pr_err("%s: Failed to start request %llx, at %x\n",
__func__, rq->fence.seqno, hws_seqno(&h, rq));
intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
@@ -1449,10 +1444,203 @@ err_unlock:
return err;
}
+static void __preempt_begin(void)
+{
+ preempt_disable();
+}
+
+static void __preempt_end(void)
+{
+ preempt_enable();
+}
+
+static void __softirq_begin(void)
+{
+ local_bh_disable();
+}
+
+static void __softirq_end(void)
+{
+ local_bh_enable();
+}
+
+static void __hardirq_begin(void)
+{
+ local_irq_disable();
+}
+
+static void __hardirq_end(void)
+{
+ local_irq_enable();
+}
+
+struct atomic_section {
+ const char *name;
+ void (*critical_section_begin)(void);
+ void (*critical_section_end)(void);
+};
+
+static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
+ const struct atomic_section *p,
+ const char *mode)
+{
+ struct tasklet_struct * const t = &engine->execlists.tasklet;
+ int err;
+
+ GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
+ engine->name, mode, p->name);
+
+ tasklet_disable_nosync(t);
+ p->critical_section_begin();
+
+ err = i915_reset_engine(engine, NULL);
+
+ p->critical_section_end();
+ tasklet_enable(t);
+
+ if (err)
+ pr_err("i915_reset_engine(%s:%s) failed under %s\n",
+ engine->name, mode, p->name);
+
+ return err;
+}
+
+static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
+ const struct atomic_section *p)
+{
+ struct drm_i915_private *i915 = engine->i915;
+ struct i915_request *rq;
+ struct hang h;
+ int err;
+
+ err = __igt_atomic_reset_engine(engine, p, "idle");
+ if (err)
+ return err;
+
+ err = hang_init(&h, i915);
+ if (err)
+ return err;
+
+ rq = hang_create_request(&h, engine);
+ if (IS_ERR(rq)) {
+ err = PTR_ERR(rq);
+ goto out;
+ }
+
+ i915_request_get(rq);
+ i915_request_add(rq);
+
+ if (wait_until_running(&h, rq)) {
+ err = __igt_atomic_reset_engine(engine, p, "active");
+ } else {
+ pr_err("%s(%s): Failed to start request %llx, at %x\n",
+ __func__, engine->name,
+ rq->fence.seqno, hws_seqno(&h, rq));
+ i915_gem_set_wedged(i915);
+ err = -EIO;
+ }
+
+ if (err == 0) {
+ struct igt_wedge_me w;
+
+ igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout*/)
+ i915_request_wait(rq,
+ I915_WAIT_LOCKED,
+ MAX_SCHEDULE_TIMEOUT);
+ if (i915_terminally_wedged(&i915->gpu_error))
+ err = -EIO;
+ }
+
+ i915_request_put(rq);
+out:
+ hang_fini(&h);
+ return err;
+}
+
+static void force_reset(struct drm_i915_private *i915)
+{
+ i915_gem_set_wedged(i915);
+ i915_reset(i915, 0, NULL);
+}
+
+static int igt_atomic_reset(void *arg)
+{
+ static const struct atomic_section phases[] = {
+ { "preempt", __preempt_begin, __preempt_end },
+ { "softirq", __softirq_begin, __softirq_end },
+ { "hardirq", __hardirq_begin, __hardirq_end },
+ { }
+ };
+ struct drm_i915_private *i915 = arg;
+ intel_wakeref_t wakeref;
+ int err = 0;
+
+ /* Check that the resets are usable from atomic context */
+
+ if (USES_GUC_SUBMISSION(i915))
+ return 0; /* guc is dead; long live the guc */
+
+ igt_global_reset_lock(i915);
+ mutex_lock(&i915->drm.struct_mutex);
+ wakeref = intel_runtime_pm_get(i915);
+
+ /* Flush any requests before we get started and check basics */
+ force_reset(i915);
+ if (i915_terminally_wedged(&i915->gpu_error))
+ goto unlock;
+
+ if (intel_has_gpu_reset(i915)) {
+ const typeof(*phases) *p;
+
+ for (p = phases; p->name; p++) {
+ GEM_TRACE("intel_gpu_reset under %s\n", p->name);
+
+ p->critical_section_begin();
+ err = intel_gpu_reset(i915, ALL_ENGINES);
+ p->critical_section_end();
+
+ if (err) {
+ pr_err("intel_gpu_reset failed under %s\n",
+ p->name);
+ goto out;
+ }
+ }
+
+ force_reset(i915);
+ }
+
+ if (intel_has_reset_engine(i915)) {
+ struct intel_engine_cs *engine;
+ enum intel_engine_id id;
+
+ for_each_engine(engine, i915, id) {
+ const typeof(*phases) *p;
+
+ for (p = phases; p->name; p++) {
+ err = igt_atomic_reset_engine(engine, p);
+ if (err)
+ goto out;
+ }
+ }
+ }
+
+out:
+ /* As we poke around the guts, do a full reset before continuing. */
+ force_reset(i915);
+
+unlock:
+ intel_runtime_pm_put(i915, wakeref);
+ mutex_unlock(&i915->drm.struct_mutex);
+ igt_global_reset_unlock(i915);
+
+ return err;
+}
+
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
static const struct i915_subtest tests[] = {
SUBTEST(igt_global_reset), /* attempt to recover GPU first */
+ SUBTEST(igt_wedged_reset),
SUBTEST(igt_hang_sanitycheck),
SUBTEST(igt_reset_idle_engine),
SUBTEST(igt_reset_active_engine),
@@ -1463,7 +1651,9 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
SUBTEST(igt_reset_evict_ppgtt),
SUBTEST(igt_reset_evict_fence),
SUBTEST(igt_handle_error),
+ SUBTEST(igt_atomic_reset),
};
+ intel_wakeref_t wakeref;
bool saved_hangcheck;
int err;
@@ -1473,8 +1663,9 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
if (i915_terminally_wedged(&i915->gpu_error))
return -EIO; /* we're long past hope of a successful reset */
- intel_runtime_pm_get(i915);
+ wakeref = intel_runtime_pm_get(i915);
saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
+ drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */
err = i915_subtests(tests, i915);
@@ -1483,7 +1674,7 @@ int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
mutex_unlock(&i915->drm.struct_mutex);
i915_modparams.enable_hangcheck = saved_hangcheck;
- intel_runtime_pm_put(i915);
+ intel_runtime_pm_put(i915, wakeref);
return err;
}