diff options
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
| -rw-r--r-- | drivers/gpu/drm/i915/i915_gpu_error.c | 1228 |
1 files changed, 578 insertions, 650 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c index aa6791255252..3c85cb0ee99f 100644 --- a/drivers/gpu/drm/i915/i915_gpu_error.c +++ b/drivers/gpu/drm/i915/i915_gpu_error.c @@ -29,56 +29,27 @@ #include <linux/ascii85.h> #include <linux/nmi.h> +#include <linux/pagevec.h> #include <linux/scatterlist.h> -#include <linux/stop_machine.h> #include <linux/utsname.h> #include <linux/zlib.h> #include <drm/drm_print.h> -#include "i915_gpu_error.h" -#include "i915_drv.h" - -static inline const struct intel_engine_cs * -engine_lookup(const struct drm_i915_private *i915, unsigned int id) -{ - if (id >= I915_NUM_ENGINES) - return NULL; - - return i915->engine[id]; -} - -static inline const char * -__engine_name(const struct intel_engine_cs *engine) -{ - return engine ? engine->name : ""; -} +#include "display/intel_atomic.h" +#include "display/intel_overlay.h" -static const char * -engine_name(const struct drm_i915_private *i915, unsigned int id) -{ - return __engine_name(engine_lookup(i915, id)); -} +#include "gem/i915_gem_context.h" +#include "gem/i915_gem_lmem.h" -static const char *tiling_flag(int tiling) -{ - switch (tiling) { - default: - case I915_TILING_NONE: return ""; - case I915_TILING_X: return " X"; - case I915_TILING_Y: return " Y"; - } -} - -static const char *dirty_flag(int dirty) -{ - return dirty ? " dirty" : ""; -} +#include "i915_drv.h" +#include "i915_gpu_error.h" +#include "i915_memcpy.h" +#include "i915_scatterlist.h" +#include "intel_csr.h" -static const char *purgeable_flag(int purgeable) -{ - return purgeable ? " purgeable" : ""; -} +#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN) +#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN) static void __sg_set_buf(struct scatterlist *sg, void *addr, unsigned int len, loff_t it) @@ -107,7 +78,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) if (e->cur == e->end) { struct scatterlist *sgl; - sgl = (typeof(sgl))__get_free_page(GFP_KERNEL); + sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL); if (!sgl) { e->err = -ENOMEM; return false; @@ -127,7 +98,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len) } e->size = ALIGN(len + 1, SZ_64K); - e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); + e->buf = kmalloc(e->size, ALLOW_FAIL); if (!e->buf) { e->size = PAGE_ALIGN(len + 1); e->buf = kmalloc(e->size, GFP_KERNEL); @@ -204,47 +175,116 @@ i915_error_printer(struct drm_i915_error_state_buf *e) return p; } +/* single threaded page allocator with a reserved stash for emergencies */ +static void pool_fini(struct pagevec *pv) +{ + pagevec_release(pv); +} + +static int pool_refill(struct pagevec *pv, gfp_t gfp) +{ + while (pagevec_space(pv)) { + struct page *p; + + p = alloc_page(gfp); + if (!p) + return -ENOMEM; + + pagevec_add(pv, p); + } + + return 0; +} + +static int pool_init(struct pagevec *pv, gfp_t gfp) +{ + int err; + + pagevec_init(pv); + + err = pool_refill(pv, gfp); + if (err) + pool_fini(pv); + + return err; +} + +static void *pool_alloc(struct pagevec *pv, gfp_t gfp) +{ + struct page *p; + + p = alloc_page(gfp); + if (!p && pagevec_count(pv)) + p = pv->pages[--pv->nr]; + + return p ? page_address(p) : NULL; +} + +static void pool_free(struct pagevec *pv, void *addr) +{ + struct page *p = virt_to_page(addr); + + if (pagevec_space(pv)) + pagevec_add(pv, p); + else + __free_page(p); +} + #ifdef CONFIG_DRM_I915_COMPRESS_ERROR struct compress { + struct pagevec pool; struct z_stream_s zstream; void *tmp; + bool wc; }; static bool compress_init(struct compress *c) { - struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream)); + struct z_stream_s *zstream = &c->zstream; - zstream->workspace = - kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), - GFP_ATOMIC | __GFP_NOWARN); - if (!zstream->workspace) + if (pool_init(&c->pool, ALLOW_FAIL)) return false; - if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) { - kfree(zstream->workspace); + zstream->workspace = + kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + ALLOW_FAIL); + if (!zstream->workspace) { + pool_fini(&c->pool); return false; } c->tmp = NULL; if (i915_has_memcpy_from_wc()) - c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN); + c->tmp = pool_alloc(&c->pool, ALLOW_FAIL); return true; } -static void *compress_next_page(struct drm_i915_error_object *dst) +static bool compress_start(struct compress *c) +{ + struct z_stream_s *zstream = &c->zstream; + void *workspace = zstream->workspace; + + memset(zstream, 0, sizeof(*zstream)); + zstream->workspace = workspace; + + return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK; +} + +static void *compress_next_page(struct compress *c, + struct drm_i915_error_object *dst) { - unsigned long page; + void *page; if (dst->page_count >= dst->num_pages) return ERR_PTR(-ENOSPC); - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); + page = pool_alloc(&c->pool, ALLOW_FAIL); if (!page) return ERR_PTR(-ENOMEM); - return dst->pages[dst->page_count++] = (void *)page; + return dst->pages[dst->page_count++] = page; } static int compress_page(struct compress *c, @@ -254,13 +294,13 @@ static int compress_page(struct compress *c, struct z_stream_s *zstream = &c->zstream; zstream->next_in = src; - if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) + if (c->wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE)) zstream->next_in = c->tmp; zstream->avail_in = PAGE_SIZE; do { if (zstream->avail_out == 0) { - zstream->next_out = compress_next_page(dst); + zstream->next_out = compress_next_page(c, dst); if (IS_ERR(zstream->next_out)) return PTR_ERR(zstream->next_out); @@ -269,8 +309,6 @@ static int compress_page(struct compress *c, if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK) return -EIO; - - touch_nmi_watchdog(); } while (zstream->avail_in); /* Fallback to uncompressed if we increase size? */ @@ -288,7 +326,7 @@ static int compress_flush(struct compress *c, do { switch (zlib_deflate(zstream, Z_FINISH)) { case Z_OK: /* more space requested */ - zstream->next_out = compress_next_page(dst); + zstream->next_out = compress_next_page(c, dst); if (IS_ERR(zstream->next_out)) return PTR_ERR(zstream->next_out); @@ -309,15 +347,17 @@ end: return 0; } -static void compress_fini(struct compress *c, - struct drm_i915_error_object *dst) +static void compress_finish(struct compress *c) { - struct z_stream_s *zstream = &c->zstream; + zlib_deflateEnd(&c->zstream); +} - zlib_deflateEnd(zstream); - kfree(zstream->workspace); +static void compress_fini(struct compress *c) +{ + kfree(c->zstream.workspace); if (c->tmp) - free_page((unsigned long)c->tmp); + pool_free(&c->pool, c->tmp); + pool_fini(&c->pool); } static void err_compression_marker(struct drm_i915_error_state_buf *m) @@ -328,10 +368,17 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m) #else struct compress { + struct pagevec pool; + bool wc; }; static bool compress_init(struct compress *c) { + return pool_init(&c->pool, ALLOW_FAIL) == 0; +} + +static bool compress_start(struct compress *c) +{ return true; } @@ -339,15 +386,13 @@ static int compress_page(struct compress *c, void *src, struct drm_i915_error_object *dst) { - unsigned long page; void *ptr; - page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN); - if (!page) + ptr = pool_alloc(&c->pool, ALLOW_FAIL); + if (!ptr) return -ENOMEM; - ptr = (void *)page; - if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE)) + if (!(c->wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE))) memcpy(ptr, src, PAGE_SIZE); dst->pages[dst->page_count++] = ptr; @@ -360,11 +405,15 @@ static int compress_flush(struct compress *c, return 0; } -static void compress_fini(struct compress *c, - struct drm_i915_error_object *dst) +static void compress_finish(struct compress *c) { } +static void compress_fini(struct compress *c) +{ + pool_fini(&c->pool); +} + static void err_compression_marker(struct drm_i915_error_state_buf *m) { err_puts(m, "~"); @@ -372,49 +421,17 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m) #endif -static void print_error_buffers(struct drm_i915_error_state_buf *m, - const char *name, - struct drm_i915_error_buffer *err, - int count) -{ - err_printf(m, "%s [%d]:\n", name, count); - - while (count--) { - err_printf(m, " %08x_%08x %8u %02x %02x %02x", - upper_32_bits(err->gtt_offset), - lower_32_bits(err->gtt_offset), - err->size, - err->read_domains, - err->write_domain, - err->wseqno); - err_puts(m, tiling_flag(err->tiling)); - err_puts(m, dirty_flag(err->dirty)); - err_puts(m, purgeable_flag(err->purgeable)); - err_puts(m, err->userptr ? " userptr" : ""); - err_puts(m, err->engine != -1 ? " " : ""); - err_puts(m, engine_name(m->i915, err->engine)); - err_puts(m, i915_cache_level_str(m->i915, err->cache_level)); - - if (err->name) - err_printf(m, " (name: %d)", err->name); - if (err->fence_reg != I915_FENCE_REG_NONE) - err_printf(m, " (fence: %d)", err->fence_reg); - - err_puts(m, "\n"); - err++; - } -} - static void error_print_instdone(struct drm_i915_error_state_buf *m, const struct drm_i915_error_engine *ee) { + const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu; int slice; int subslice; err_printf(m, " INSTDONE: 0x%08x\n", ee->instdone.instdone); - if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3) + if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3) return; err_printf(m, " SC_INSTDONE: 0x%08x\n", @@ -423,22 +440,17 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m, if (INTEL_GEN(m->i915) <= 6) return; - for_each_instdone_slice_subslice(m->i915, slice, subslice) + for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice) err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n", slice, subslice, ee->instdone.sampler[slice][subslice]); - for_each_instdone_slice_subslice(m->i915, slice, subslice) + for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice) err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n", slice, subslice, ee->instdone.row[slice][subslice]); } -static const char *bannable(const struct drm_i915_error_context *ctx) -{ - return ctx->bannable ? "" : " (unbannable)"; -} - static void error_print_request(struct drm_i915_error_state_buf *m, const char *prefix, const struct drm_i915_error_request *erq, @@ -447,9 +459,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m, if (!erq->seqno) return; - err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", - prefix, erq->pid, erq->ban_score, - erq->context, erq->seqno, + err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n", + prefix, erq->pid, erq->context, erq->seqno, test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &erq->flags) ? "!" : "", test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, @@ -463,9 +474,8 @@ static void error_print_context(struct drm_i915_error_state_buf *m, const char *header, const struct drm_i915_error_context *ctx) { - err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n", - header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id, - ctx->sched_attr.priority, ctx->ban_score, bannable(ctx), + err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n", + header, ctx->comm, ctx->pid, ctx->sched_attr.priority, ctx->guilty, ctx->active); } @@ -475,8 +485,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, { int n; - err_printf(m, "%s command stream:\n", - engine_name(m->i915, ee->engine_id)); + err_printf(m, "%s command stream:\n", ee->engine->name); err_printf(m, " IDLE?: %s\n", yesno(ee->idle)); err_printf(m, " START: 0x%08x\n", ee->start); err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head); @@ -512,13 +521,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, if (INTEL_GEN(m->i915) >= 6) { err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi); err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg); - err_printf(m, " SYNC_0: 0x%08x\n", - ee->semaphore_mboxes[0]); - err_printf(m, " SYNC_1: 0x%08x\n", - ee->semaphore_mboxes[1]); - if (HAS_VEBOX(m->i915)) - err_printf(m, " SYNC_2: 0x%08x\n", - ee->semaphore_mboxes[2]); } if (HAS_PPGTT(m->i915)) { err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode); @@ -533,14 +535,8 @@ static void error_print_engine(struct drm_i915_error_state_buf *m, ee->vm_info.pp_dir_base); } } - err_printf(m, " seqno: 0x%08x\n", ee->seqno); - err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno); err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head); err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail); - err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n", - jiffies_to_msecs(ee->hangcheck_timestamp - epoch), - ee->hangcheck_timestamp, - ee->hangcheck_timestamp == epoch ? "; epoch" : ""); err_printf(m, " engine reset count: %u\n", ee->reset_count); for (n = 0; n < ee->num_ports; n++) { @@ -561,9 +557,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...) } static void print_error_obj(struct drm_i915_error_state_buf *m, - struct intel_engine_cs *engine, + const struct intel_engine_cs *engine, const char *name, - struct drm_i915_error_object *obj) + const struct drm_i915_error_object *obj) { char out[ASCII85_BUFSZ]; int page; @@ -578,6 +574,9 @@ static void print_error_obj(struct drm_i915_error_state_buf *m, lower_32_bits(obj->gtt_offset)); } + if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K) + err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes); + err_compression_marker(m); for (page = 0; page < obj->page_count; page++) { int i, len; @@ -632,7 +631,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m, const struct i915_gpu_state *error = container_of(error_uc, typeof(*error), uc); - if (!error->device_info.has_guc) + if (!error->device_info.has_gt_uc) return; intel_uc_fw_dump(&error_uc->guc_fw, &p); @@ -660,7 +659,7 @@ static void err_free_sgl(struct scatterlist *sgl) static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, struct i915_gpu_state *error) { - struct drm_i915_error_object *obj; + const struct drm_i915_error_engine *ee; struct timespec64 ts; int i, j; @@ -669,6 +668,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, err_printf(m, "Kernel: %s %s\n", init_utsname()->release, init_utsname()->machine); + err_printf(m, "Driver: %s\n", DRIVER_DATE); ts = ktime_to_timespec64(error->time); err_printf(m, "Time: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); @@ -678,26 +678,21 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, ts = ktime_to_timespec64(error->uptime); err_printf(m, "Uptime: %lld s %ld us\n", (s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC); - err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ); - err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n", - error->capture, - jiffies_to_msecs(jiffies - error->capture), - jiffies_to_msecs(error->capture - error->epoch)); - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (!error->engine[i].context.pid) - continue; + err_printf(m, "Capture: %lu jiffies; %d ms ago\n", + error->capture, jiffies_to_msecs(jiffies - error->capture)); + + for (ee = error->engine; ee; ee = ee->next) + err_printf(m, "Active process (on ring %s): %s [%d]\n", + ee->engine->name, + ee->context.comm, + ee->context.pid); - err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n", - engine_name(m->i915, i), - error->engine[i].context.comm, - error->engine[i].context.pid, - error->engine[i].context.ban_score, - bannable(&error->engine[i].context)); - } err_printf(m, "Reset count: %u\n", error->reset_count); err_printf(m, "Suspend count: %u\n", error->suspend_count); err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform)); + err_printf(m, "Subplatform: 0x%x\n", + intel_subplatform(&error->runtime_info, + error->device_info.platform)); err_print_pciid(m, m->i915); err_printf(m, "IOMMU enabled?: %d\n", error->iommu); @@ -727,101 +722,73 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m, for (i = 0; i < error->nfence; i++) err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]); - if (INTEL_GEN(m->i915) >= 6) { + if (IS_GEN_RANGE(m->i915, 6, 11)) { err_printf(m, "ERROR: 0x%08x\n", error->error); - - if (INTEL_GEN(m->i915) >= 8) - err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", - error->fault_data1, error->fault_data0); - err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg); } + if (INTEL_GEN(m->i915) >= 8) + err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n", + error->fault_data1, error->fault_data0); + if (IS_GEN(m->i915, 7)) err_printf(m, "ERR_INT: 0x%08x\n", error->err_int); - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - if (error->engine[i].engine_id != -1) - error_print_engine(m, &error->engine[i], error->epoch); - } + if (IS_GEN_RANGE(m->i915, 8, 11)) + err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache); - for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) { - char buf[128]; - int len, first = 1; + if (IS_GEN(m->i915, 12)) + err_printf(m, "AUX_ERR_DBG: 0x%08x\n", error->aux_err); - if (!error->active_vm[i]) - break; + if (INTEL_GEN(m->i915) >= 12) { + int i; - len = scnprintf(buf, sizeof(buf), "Active ("); - for (j = 0; j < ARRAY_SIZE(error->engine); j++) { - if (error->engine[j].vm != error->active_vm[i]) - continue; + for (i = 0; i < GEN12_SFC_DONE_MAX; i++) + err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i, + error->sfc_done[i]); - len += scnprintf(buf + len, sizeof(buf), "%s%s", - first ? "" : ", ", - m->i915->engine[j]->name); - first = 0; - } - scnprintf(buf + len, sizeof(buf), ")"); - print_error_buffers(m, buf, - error->active_bo[i], - error->active_bo_count[i]); + err_printf(m, " GAM_DONE: 0x%08x\n", error->gam_done); } - print_error_buffers(m, "Pinned (global)", - error->pinned_bo, - error->pinned_bo_count); + for (ee = error->engine; ee; ee = ee->next) + error_print_engine(m, ee, error->capture); - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - const struct drm_i915_error_engine *ee = &error->engine[i]; + for (ee = error->engine; ee; ee = ee->next) { + const struct drm_i915_error_object *obj; obj = ee->batchbuffer; if (obj) { - err_puts(m, m->i915->engine[i]->name); + err_puts(m, ee->engine->name); if (ee->context.pid) - err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)", + err_printf(m, " (submitted by %s [%d])", ee->context.comm, - ee->context.pid, - ee->context.handle, - ee->context.hw_id, - ee->context.ban_score, - bannable(&ee->context)); + ee->context.pid); err_printf(m, " --- gtt_offset = 0x%08x %08x\n", upper_32_bits(obj->gtt_offset), lower_32_bits(obj->gtt_offset)); - print_error_obj(m, m->i915->engine[i], NULL, obj); + print_error_obj(m, ee->engine, NULL, obj); } for (j = 0; j < ee->user_bo_count; j++) - print_error_obj(m, m->i915->engine[i], - "user", ee->user_bo[j]); + print_error_obj(m, ee->engine, "user", ee->user_bo[j]); if (ee->num_requests) { err_printf(m, "%s --- %d requests\n", - m->i915->engine[i]->name, + ee->engine->name, ee->num_requests); for (j = 0; j < ee->num_requests; j++) error_print_request(m, " ", &ee->requests[j], - error->epoch); + error->capture); } - print_error_obj(m, m->i915->engine[i], - "ringbuffer", ee->ringbuffer); - - print_error_obj(m, m->i915->engine[i], - "HW Status", ee->hws_page); - - print_error_obj(m, m->i915->engine[i], - "HW context", ee->ctx); - - print_error_obj(m, m->i915->engine[i], - "WA context", ee->wa_ctx); - - print_error_obj(m, m->i915->engine[i], + print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer); + print_error_obj(m, ee->engine, "HW Status", ee->hws_page); + print_error_obj(m, ee->engine, "HW context", ee->ctx); + print_error_obj(m, ee->engine, "WA context", ee->wa_ctx); + print_error_obj(m, ee->engine, "WA batchbuffer", ee->wa_batchbuffer); - - print_error_obj(m, m->i915->engine[i], + print_error_obj(m, ee->engine, "NULL context", ee->default_state); } @@ -970,13 +937,15 @@ void __i915_gpu_state_free(struct kref *error_ref) { struct i915_gpu_state *error = container_of(error_ref, typeof(*error), ref); - long i, j; + long i; - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; + while (error->engine) { + struct drm_i915_error_engine *ee = error->engine; - for (j = 0; j < ee->user_bo_count; j++) - i915_error_object_free(ee->user_bo[j]); + error->engine = ee->next; + + for (i = 0; i < ee->user_bo_count; i++) + i915_error_object_free(ee->user_bo[i]); kfree(ee->user_bo); i915_error_object_free(ee->batchbuffer); @@ -987,12 +956,9 @@ void __i915_gpu_state_free(struct kref *error_ref) i915_error_object_free(ee->wa_ctx); kfree(ee->requests); + kfree(ee); } - for (i = 0; i < ARRAY_SIZE(error->active_bo); i++) - kfree(error->active_bo[i]); - kfree(error->pinned_bo); - kfree(error->overlay); kfree(error->display); @@ -1005,130 +971,98 @@ void __i915_gpu_state_free(struct kref *error_ref) static struct drm_i915_error_object * i915_error_object_create(struct drm_i915_private *i915, - struct i915_vma *vma) + struct i915_vma *vma, + struct compress *compress) { struct i915_ggtt *ggtt = &i915->ggtt; const u64 slot = ggtt->error_capture.start; struct drm_i915_error_object *dst; - struct compress compress; unsigned long num_pages; struct sgt_iter iter; - dma_addr_t dma; int ret; + might_sleep(); + if (!vma || !vma->pages) return NULL; num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT; num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */ - dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), - GFP_ATOMIC | __GFP_NOWARN); + dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL); if (!dst) return NULL; + if (!compress_start(compress)) { + kfree(dst); + return NULL; + } + dst->gtt_offset = vma->node.start; dst->gtt_size = vma->node.size; + dst->gtt_page_sizes = vma->page_sizes.gtt; dst->num_pages = num_pages; dst->page_count = 0; dst->unused = 0; - if (!compress_init(&compress)) { - kfree(dst); - return NULL; - } + compress->wc = i915_gem_object_is_lmem(vma->obj) || + drm_mm_node_allocated(&ggtt->error_capture); ret = -EINVAL; - for_each_sgt_dma(dma, iter, vma->pages) { + if (drm_mm_node_allocated(&ggtt->error_capture)) { void __iomem *s; + dma_addr_t dma; - ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); - - s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); - ret = compress_page(&compress, (void __force *)s, dst); - io_mapping_unmap_atomic(s); - if (ret) - break; - } - - if (ret || compress_flush(&compress, dst)) { - while (dst->page_count--) - free_page((unsigned long)dst->pages[dst->page_count]); - kfree(dst); - dst = NULL; - } - - compress_fini(&compress, dst); - return dst; -} - -/* The error capture is special as tries to run underneath the normal - * locking rules - so we use the raw version of the i915_active_request lookup. - */ -static inline u32 -__active_get_seqno(struct i915_active_request *active) -{ - struct i915_request *request; - - request = __i915_active_request_peek(active); - return request ? request->global_seqno : 0; -} + for_each_sgt_daddr(dma, iter, vma->pages) { + ggtt->vm.insert_page(&ggtt->vm, dma, slot, + I915_CACHE_NONE, 0); -static inline int -__active_get_engine_id(struct i915_active_request *active) -{ - struct i915_request *request; - - request = __i915_active_request_peek(active); - return request ? request->engine->id : -1; -} - -static void capture_bo(struct drm_i915_error_buffer *err, - struct i915_vma *vma) -{ - struct drm_i915_gem_object *obj = vma->obj; + s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE); + ret = compress_page(compress, (void __force *)s, dst); + io_mapping_unmap(s); + if (ret) + break; + } + } else if (i915_gem_object_is_lmem(vma->obj)) { + struct intel_memory_region *mem = vma->obj->mm.region; + dma_addr_t dma; - err->size = obj->base.size; - err->name = obj->base.name; + for_each_sgt_daddr(dma, iter, vma->pages) { + void __iomem *s; - err->wseqno = __active_get_seqno(&obj->frontbuffer_write); - err->engine = __active_get_engine_id(&obj->frontbuffer_write); + s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE); + ret = compress_page(compress, (void __force *)s, dst); + io_mapping_unmap(s); + if (ret) + break; + } + } else { + struct page *page; - err->gtt_offset = vma->node.start; - err->read_domains = obj->read_domains; - err->write_domain = obj->write_domain; - err->fence_reg = vma->fence ? vma->fence->id : -1; - err->tiling = i915_gem_object_get_tiling(obj); - err->dirty = obj->mm.dirty; - err->purgeable = obj->mm.madv != I915_MADV_WILLNEED; - err->userptr = obj->userptr.mm != NULL; - err->cache_level = obj->cache_level; -} + for_each_sgt_page(page, iter, vma->pages) { + void *s; -static u32 capture_error_bo(struct drm_i915_error_buffer *err, - int count, struct list_head *head, - unsigned int flags) -#define ACTIVE_ONLY BIT(0) -#define PINNED_ONLY BIT(1) -{ - struct i915_vma *vma; - int i = 0; + drm_clflush_pages(&page, 1); - list_for_each_entry(vma, head, vm_link) { - if (!vma->obj) - continue; + s = kmap(page); + ret = compress_page(compress, s, dst); + kunmap(s); - if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma)) - continue; + drm_clflush_pages(&page, 1); - if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma)) - continue; + if (ret) + break; + } + } - capture_bo(err++, vma); - if (++i == count) - break; + if (ret || compress_flush(compress, dst)) { + while (dst->page_count--) + pool_free(&compress->pool, dst->pages[dst->page_count]); + kfree(dst); + dst = NULL; } + compress_finish(compress); - return i; + return dst; } /* @@ -1141,55 +1075,43 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err, * * It's only a small step better than a random number in its current form. */ -static u32 i915_error_generate_code(struct i915_gpu_state *error, - unsigned long engine_mask) +static u32 i915_error_generate_code(struct i915_gpu_state *error) { + const struct drm_i915_error_engine *ee = error->engine; + /* * IPEHR would be an ideal way to detect errors, as it's the gross * measure of "the command that hung." However, has some very common * synchronization commands which almost always appear in the case * strictly a client bug. Use instdone to differentiate those some. */ - if (engine_mask) { - struct drm_i915_error_engine *ee = - &error->engine[ffs(engine_mask)]; - - return ee->ipehr ^ ee->instdone.instdone; - } - - return 0; + return ee ? ee->ipehr ^ ee->instdone.instdone : 0; } static void gem_record_fences(struct i915_gpu_state *error) { struct drm_i915_private *dev_priv = error->i915; + struct intel_uncore *uncore = &dev_priv->uncore; int i; if (INTEL_GEN(dev_priv) >= 6) { - for (i = 0; i < dev_priv->num_fence_regs; i++) - error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i)); + for (i = 0; i < dev_priv->ggtt.num_fences; i++) + error->fence[i] = + intel_uncore_read64(uncore, + FENCE_REG_GEN6_LO(i)); } else if (INTEL_GEN(dev_priv) >= 4) { - for (i = 0; i < dev_priv->num_fence_regs; i++) - error->fence[i] = I915_READ64(FENCE_REG_965_LO(i)); + for (i = 0; i < dev_priv->ggtt.num_fences; i++) + error->fence[i] = + intel_uncore_read64(uncore, + FENCE_REG_965_LO(i)); } else { - for (i = 0; i < dev_priv->num_fence_regs; i++) - error->fence[i] = I915_READ(FENCE_REG(i)); + for (i = 0; i < dev_priv->ggtt.num_fences; i++) + error->fence[i] = + intel_uncore_read(uncore, FENCE_REG(i)); } error->nfence = i; } -static void gen6_record_semaphore_state(struct intel_engine_cs *engine, - struct drm_i915_error_engine *ee) -{ - struct drm_i915_private *dev_priv = engine->i915; - - ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base)); - ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base)); - if (HAS_VEBOX(dev_priv)) - ee->semaphore_mboxes[2] = - I915_READ(RING_SYNC_2(engine->mmio_base)); -} - static void error_record_engine_registers(struct i915_gpu_state *error, struct intel_engine_cs *engine, struct drm_i915_error_engine *ee) @@ -1197,44 +1119,43 @@ static void error_record_engine_registers(struct i915_gpu_state *error, struct drm_i915_private *dev_priv = engine->i915; if (INTEL_GEN(dev_priv) >= 6) { - ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base)); - if (INTEL_GEN(dev_priv) >= 8) { + ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL); + + if (INTEL_GEN(dev_priv) >= 12) + ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG); + else if (INTEL_GEN(dev_priv) >= 8) ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG); - } else { - gen6_record_semaphore_state(engine, ee); - ee->fault_reg = I915_READ(RING_FAULT_REG(engine)); - } + else + ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine); } if (INTEL_GEN(dev_priv) >= 4) { - ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base)); - ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base)); - ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base)); - ee->instps = I915_READ(RING_INSTPS(engine->mmio_base)); - ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base)); + ee->faddr = ENGINE_READ(engine, RING_DMA_FADD); + ee->ipeir = ENGINE_READ(engine, RING_IPEIR); + ee->ipehr = ENGINE_READ(engine, RING_IPEHR); + ee->instps = ENGINE_READ(engine, RING_INSTPS); + ee->bbaddr = ENGINE_READ(engine, RING_BBADDR); if (INTEL_GEN(dev_priv) >= 8) { - ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32; - ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32; + ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32; + ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32; } - ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base)); + ee->bbstate = ENGINE_READ(engine, RING_BBSTATE); } else { - ee->faddr = I915_READ(DMA_FADD_I8XX); - ee->ipeir = I915_READ(IPEIR); - ee->ipehr = I915_READ(IPEHR); + ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX); + ee->ipeir = ENGINE_READ(engine, IPEIR); + ee->ipehr = ENGINE_READ(engine, IPEHR); } intel_engine_get_instdone(engine, &ee->instdone); - ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base)); + ee->instpm = ENGINE_READ(engine, RING_INSTPM); ee->acthd = intel_engine_get_active_head(engine); - ee->seqno = intel_engine_get_seqno(engine); - ee->last_seqno = intel_engine_last_submit(engine); - ee->start = I915_READ_START(engine); - ee->head = I915_READ_HEAD(engine); - ee->tail = I915_READ_TAIL(engine); - ee->ctl = I915_READ_CTL(engine); + ee->start = ENGINE_READ(engine, RING_START); + ee->head = ENGINE_READ(engine, RING_HEAD); + ee->tail = ENGINE_READ(engine, RING_TAIL); + ee->ctl = ENGINE_READ(engine, RING_CTL); if (INTEL_GEN(dev_priv) > 2) - ee->mode = I915_READ_MODE(engine); + ee->mode = ENGINE_READ(engine, RING_MI_MODE); if (!HWS_NEEDS_PHYSICAL(dev_priv)) { i915_reg_t mmio; @@ -1242,16 +1163,18 @@ static void error_record_engine_registers(struct i915_gpu_state *error, if (IS_GEN(dev_priv, 7)) { switch (engine->id) { default: - case RCS: + MISSING_CASE(engine->id); + /* fall through */ + case RCS0: mmio = RENDER_HWS_PGA_GEN7; break; - case BCS: + case BCS0: mmio = BLT_HWS_PGA_GEN7; break; - case VCS: + case VCS0: mmio = BSD_HWS_PGA_GEN7; break; - case VECS: + case VECS0: mmio = VEBOX_HWS_PGA_GEN7; break; } @@ -1266,43 +1189,43 @@ static void error_record_engine_registers(struct i915_gpu_state *error, } ee->idle = intel_engine_is_idle(engine); - if (!ee->idle) - ee->hangcheck_timestamp = engine->hangcheck.action_timestamp; ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error, engine); if (HAS_PPGTT(dev_priv)) { int i; - ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine)); + ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7); - if (IS_GEN(dev_priv, 6)) + if (IS_GEN(dev_priv, 6)) { ee->vm_info.pp_dir_base = - I915_READ(RING_PP_DIR_BASE_READ(engine)); - else if (IS_GEN(dev_priv, 7)) + ENGINE_READ(engine, RING_PP_DIR_BASE_READ); + } else if (IS_GEN(dev_priv, 7)) { ee->vm_info.pp_dir_base = - I915_READ(RING_PP_DIR_BASE(engine)); - else if (INTEL_GEN(dev_priv) >= 8) + ENGINE_READ(engine, RING_PP_DIR_BASE); + } else if (INTEL_GEN(dev_priv) >= 8) { + u32 base = engine->mmio_base; + for (i = 0; i < 4; i++) { ee->vm_info.pdp[i] = - I915_READ(GEN8_RING_PDP_UDW(engine, i)); + I915_READ(GEN8_RING_PDP_UDW(base, i)); ee->vm_info.pdp[i] <<= 32; ee->vm_info.pdp[i] |= - I915_READ(GEN8_RING_PDP_LDW(engine, i)); + I915_READ(GEN8_RING_PDP_LDW(base, i)); } + } } } -static void record_request(struct i915_request *request, +static void record_request(const struct i915_request *request, struct drm_i915_error_request *erq) { - struct i915_gem_context *ctx = request->gem_context; + const struct i915_gem_context *ctx = request->gem_context; erq->flags = request->fence.flags; - erq->context = ctx->hw_id; + erq->context = request->fence.context; + erq->seqno = request->fence.seqno; erq->sched_attr = request->sched.attr; - erq->ban_score = atomic_read(&ctx->ban_score); - erq->seqno = request->global_seqno; erq->jiffies = request->emitted_jiffies; erq->start = i915_ggtt_offset(request->ring->vma); erq->head = request->head; @@ -1322,12 +1245,12 @@ static void engine_record_requests(struct intel_engine_cs *engine, count = 0; request = first; - list_for_each_entry_from(request, &engine->timeline.requests, link) + list_for_each_entry_from(request, &engine->active.requests, sched.link) count++; if (!count) return; - ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC); + ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL); if (!ee->requests) return; @@ -1335,7 +1258,8 @@ static void engine_record_requests(struct intel_engine_cs *engine, count = 0; request = first; - list_for_each_entry_from(request, &engine->timeline.requests, link) { + list_for_each_entry_from(request, + &engine->active.requests, sched.link) { if (count >= ee->num_requests) { /* * If the ring request list was changed in @@ -1360,27 +1284,24 @@ static void engine_record_requests(struct intel_engine_cs *engine, ee->num_requests = count; } -static void error_record_engine_execlists(struct intel_engine_cs *engine, +static void error_record_engine_execlists(const struct intel_engine_cs *engine, struct drm_i915_error_engine *ee) { const struct intel_engine_execlists * const execlists = &engine->execlists; - unsigned int n; + struct i915_request * const *port = execlists->active; + unsigned int n = 0; - for (n = 0; n < execlists_num_ports(execlists); n++) { - struct i915_request *rq = port_request(&execlists->port[n]); - - if (!rq) - break; - - record_request(rq, &ee->execlist[n]); - } + while (*port) + record_request(*port++, &ee->execlist[n++]); ee->num_ports = n; } -static void record_context(struct drm_i915_error_context *e, - struct i915_gem_context *ctx) +static bool record_context(struct drm_i915_error_context *e, + const struct i915_request *rq) { + const struct i915_gem_context *ctx = rq->gem_context; + if (ctx->pid) { struct task_struct *task; @@ -1393,17 +1314,49 @@ static void record_context(struct drm_i915_error_context *e, rcu_read_unlock(); } - e->handle = ctx->user_handle; - e->hw_id = ctx->hw_id; e->sched_attr = ctx->sched; - e->ban_score = atomic_read(&ctx->ban_score); - e->bannable = i915_gem_context_is_bannable(ctx); e->guilty = atomic_read(&ctx->guilty_count); e->active = atomic_read(&ctx->active_count); + + return i915_gem_context_no_error_capture(ctx); } -static void request_record_user_bo(struct i915_request *request, - struct drm_i915_error_engine *ee) +struct capture_vma { + struct capture_vma *next; + void **slot; +}; + +static struct capture_vma * +capture_vma(struct capture_vma *next, + struct i915_vma *vma, + struct drm_i915_error_object **out) +{ + struct capture_vma *c; + + *out = NULL; + if (!vma) + return next; + + c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL); + if (!c) + return next; + + if (!i915_active_acquire_if_busy(&vma->active)) { + kfree(c); + return next; + } + + c->slot = (void **)out; + *c->slot = i915_vma_get(vma); + + c->next = next; + return c; +} + +static struct capture_vma * +request_record_user_bo(struct i915_request *request, + struct drm_i915_error_engine *ee, + struct capture_vma *capture) { struct i915_capture_list *c; struct drm_i915_error_object **bo; @@ -1413,33 +1366,34 @@ static void request_record_user_bo(struct i915_request *request, for (c = request->capture_list; c; c = c->next) max++; if (!max) - return; + return capture; - bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); + bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); if (!bo) { /* If we can't capture everything, try to capture something. */ max = min_t(long, max, PAGE_SIZE / sizeof(*bo)); - bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC); + bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL); } if (!bo) - return; + return capture; count = 0; for (c = request->capture_list; c; c = c->next) { - bo[count] = i915_error_object_create(request->i915, c->vma); - if (!bo[count]) - break; + capture = capture_vma(capture, c->vma, &bo[count]); if (++count == max) break; } ee->user_bo = bo; ee->user_bo_count = count; + + return capture; } static struct drm_i915_error_object * capture_object(struct drm_i915_private *dev_priv, - struct drm_i915_gem_object *obj) + struct drm_i915_gem_object *obj, + struct compress *compress) { if (obj && i915_gem_object_has_pages(obj)) { struct i915_vma fake = { @@ -1449,184 +1403,147 @@ capture_object(struct drm_i915_private *dev_priv, .obj = obj, }; - return i915_error_object_create(dev_priv, &fake); + return i915_error_object_create(dev_priv, &fake, compress); } else { return NULL; } } -static void gem_record_rings(struct i915_gpu_state *error) +static void +gem_record_rings(struct i915_gpu_state *error, struct compress *compress) { struct drm_i915_private *i915 = error->i915; - struct i915_ggtt *ggtt = &i915->ggtt; - int i; + struct intel_engine_cs *engine; + struct drm_i915_error_engine *ee; + + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (!ee) + return; - for (i = 0; i < I915_NUM_ENGINES; i++) { - struct intel_engine_cs *engine = i915->engine[i]; - struct drm_i915_error_engine *ee = &error->engine[i]; + for_each_uabi_engine(engine, i915) { + struct capture_vma *capture = NULL; struct i915_request *request; + unsigned long flags; - ee->engine_id = -1; + /* Refill our page pool before entering atomic section */ + pool_refill(&compress->pool, ALLOW_FAIL); - if (!engine) + spin_lock_irqsave(&engine->active.lock, flags); + request = intel_engine_find_active_request(engine); + if (!request) { + spin_unlock_irqrestore(&engine->active.lock, flags); continue; + } - ee->engine_id = i; + error->simulated |= record_context(&ee->context, request); - error_record_engine_registers(error, engine, ee); - error_record_engine_execlists(engine, ee); + /* + * We need to copy these to an anonymous buffer + * as the simplest method to avoid being overwritten + * by userspace. + */ + capture = capture_vma(capture, + request->batch, + &ee->batchbuffer); - request = i915_gem_find_active_request(engine); - if (request) { - struct i915_gem_context *ctx = request->gem_context; - struct intel_ring *ring; + if (HAS_BROKEN_CS_TLB(i915)) + capture = capture_vma(capture, + engine->gt->scratch, + &ee->wa_batchbuffer); - ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm; + capture = request_record_user_bo(request, ee, capture); - record_context(&ee->context, ctx); + capture = capture_vma(capture, + request->hw_context->state, + &ee->ctx); - /* We need to copy these to an anonymous buffer - * as the simplest method to avoid being overwritten - * by userspace. - */ - ee->batchbuffer = - i915_error_object_create(i915, request->batch); + capture = capture_vma(capture, + request->ring->vma, + &ee->ringbuffer); - if (HAS_BROKEN_CS_TLB(i915)) - ee->wa_batchbuffer = - i915_error_object_create(i915, - i915->gt.scratch); - request_record_user_bo(request, ee); + ee->cpu_ring_head = request->ring->head; + ee->cpu_ring_tail = request->ring->tail; - ee->ctx = - i915_error_object_create(i915, - request->hw_context->state); + ee->rq_head = request->head; + ee->rq_post = request->postfix; + ee->rq_tail = request->tail; - error->simulated |= - i915_gem_context_no_error_capture(ctx); + engine_record_requests(engine, request, ee); + spin_unlock_irqrestore(&engine->active.lock, flags); - ee->rq_head = request->head; - ee->rq_post = request->postfix; - ee->rq_tail = request->tail; + error_record_engine_registers(error, engine, ee); + error_record_engine_execlists(engine, ee); - ring = request->ring; - ee->cpu_ring_head = ring->head; - ee->cpu_ring_tail = ring->tail; - ee->ringbuffer = - i915_error_object_create(i915, ring->vma); + while (capture) { + struct capture_vma *this = capture; + struct i915_vma *vma = *this->slot; - engine_record_requests(engine, request, ee); + *this->slot = + i915_error_object_create(i915, vma, compress); + + i915_active_release(&vma->active); + i915_vma_put(vma); + + capture = this->next; + kfree(this); } ee->hws_page = i915_error_object_create(i915, - engine->status_page.vma); - - ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma); - - ee->default_state = capture_object(i915, engine->default_state); - } -} - -static void gem_capture_vm(struct i915_gpu_state *error, - struct i915_address_space *vm, - int idx) -{ - struct drm_i915_error_buffer *active_bo; - struct i915_vma *vma; - int count; + engine->status_page.vma, + compress); - count = 0; - list_for_each_entry(vma, &vm->bound_list, vm_link) - if (i915_vma_is_active(vma)) - count++; - - active_bo = NULL; - if (count) - active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC); - if (active_bo) - count = capture_error_bo(active_bo, - count, &vm->bound_list, - ACTIVE_ONLY); - else - count = 0; - - error->active_vm[idx] = vm; - error->active_bo[idx] = active_bo; - error->active_bo_count[idx] = count; -} + ee->wa_ctx = + i915_error_object_create(i915, + engine->wa_ctx.vma, + compress); -static void capture_active_buffers(struct i915_gpu_state *error) -{ - int cnt = 0, i, j; + ee->default_state = + capture_object(i915, engine->default_state, compress); - BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo)); - BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm)); - BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count)); + ee->engine = engine; - /* Scan each engine looking for unique active contexts/vm */ - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - struct drm_i915_error_engine *ee = &error->engine[i]; - bool found; + ee->next = error->engine; + error->engine = ee; - if (!ee->vm) - continue; - - found = false; - for (j = 0; j < i && !found; j++) - found = error->engine[j].vm == ee->vm; - if (!found) - gem_capture_vm(error, ee->vm, cnt++); + ee = kzalloc(sizeof(*ee), GFP_KERNEL); + if (!ee) + return; } -} - -static void capture_pinned_buffers(struct i915_gpu_state *error) -{ - struct i915_address_space *vm = &error->i915->ggtt.vm; - struct drm_i915_error_buffer *bo; - struct i915_vma *vma; - int count; - - count = 0; - list_for_each_entry(vma, &vm->bound_list, vm_link) - count++; - - bo = NULL; - if (count) - bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC); - if (!bo) - return; - error->pinned_bo_count = - capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY); - error->pinned_bo = bo; + kfree(ee); } -static void capture_uc_state(struct i915_gpu_state *error) +static void +capture_uc_state(struct i915_gpu_state *error, struct compress *compress) { struct drm_i915_private *i915 = error->i915; struct i915_error_uc *error_uc = &error->uc; + struct intel_uc *uc = &i915->gt.uc; /* Capturing uC state won't be useful if there is no GuC */ - if (!error->device_info.has_guc) + if (!error->device_info.has_gt_uc) return; - error_uc->guc_fw = i915->guc.fw; - error_uc->huc_fw = i915->huc.fw; + memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw)); + memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw)); /* Non-default firmware paths will be specified by the modparam. * As modparams are generally accesible from the userspace make * explicit copies of the firmware paths. */ - error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC); - error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC); - error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma); + error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL); + error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL); + error_uc->guc_log = i915_error_object_create(i915, + uc->guc.log.vma, + compress); } /* Capture all registers which don't fit into another category. */ static void capture_reg_state(struct i915_gpu_state *error) { - struct drm_i915_private *dev_priv = error->i915; + struct drm_i915_private *i915 = error->i915; + struct intel_uncore *uncore = &i915->uncore; int i; /* General organization @@ -1638,95 +1555,125 @@ static void capture_reg_state(struct i915_gpu_state *error) */ /* 1: Registers specific to a single generation */ - if (IS_VALLEYVIEW(dev_priv)) { - error->gtier[0] = I915_READ(GTIER); - error->ier = I915_READ(VLV_IER); - error->forcewake = I915_READ_FW(FORCEWAKE_VLV); + if (IS_VALLEYVIEW(i915)) { + error->gtier[0] = intel_uncore_read(uncore, GTIER); + error->ier = intel_uncore_read(uncore, VLV_IER); + error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV); } - if (IS_GEN(dev_priv, 7)) - error->err_int = I915_READ(GEN7_ERR_INT); - - if (INTEL_GEN(dev_priv) >= 8) { - error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0); - error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1); + if (IS_GEN(i915, 7)) + error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT); + + if (INTEL_GEN(i915) >= 12) { + error->fault_data0 = intel_uncore_read(uncore, + GEN12_FAULT_TLB_DATA0); + error->fault_data1 = intel_uncore_read(uncore, + GEN12_FAULT_TLB_DATA1); + } else if (INTEL_GEN(i915) >= 8) { + error->fault_data0 = intel_uncore_read(uncore, + GEN8_FAULT_TLB_DATA0); + error->fault_data1 = intel_uncore_read(uncore, + GEN8_FAULT_TLB_DATA1); } - if (IS_GEN(dev_priv, 6)) { - error->forcewake = I915_READ_FW(FORCEWAKE); - error->gab_ctl = I915_READ(GAB_CTL); - error->gfx_mode = I915_READ(GFX_MODE); + if (IS_GEN(i915, 6)) { + error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE); + error->gab_ctl = intel_uncore_read(uncore, GAB_CTL); + error->gfx_mode = intel_uncore_read(uncore, GFX_MODE); } /* 2: Registers which belong to multiple generations */ - if (INTEL_GEN(dev_priv) >= 7) - error->forcewake = I915_READ_FW(FORCEWAKE_MT); - - if (INTEL_GEN(dev_priv) >= 6) { - error->derrmr = I915_READ(DERRMR); - error->error = I915_READ(ERROR_GEN6); - error->done_reg = I915_READ(DONE_REG); + if (INTEL_GEN(i915) >= 7) + error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT); + + if (INTEL_GEN(i915) >= 6) { + error->derrmr = intel_uncore_read(uncore, DERRMR); + if (INTEL_GEN(i915) < 12) { + error->error = intel_uncore_read(uncore, ERROR_GEN6); + error->done_reg = intel_uncore_read(uncore, DONE_REG); + } } - if (INTEL_GEN(dev_priv) >= 5) - error->ccid = I915_READ(CCID); + if (INTEL_GEN(i915) >= 5) + error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE)); /* 3: Feature specific registers */ - if (IS_GEN_RANGE(dev_priv, 6, 7)) { - error->gam_ecochk = I915_READ(GAM_ECOCHK); - error->gac_eco = I915_READ(GAC_ECO_BITS); + if (IS_GEN_RANGE(i915, 6, 7)) { + error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK); + error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS); + } + + if (IS_GEN_RANGE(i915, 8, 11)) + error->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN); + + if (IS_GEN(i915, 12)) + error->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG); + + if (INTEL_GEN(i915) >= 12) { + for (i = 0; i < GEN12_SFC_DONE_MAX; i++) { + error->sfc_done[i] = + intel_uncore_read(uncore, GEN12_SFC_DONE(i)); + } + + error->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE); } /* 4: Everything else */ - if (INTEL_GEN(dev_priv) >= 11) { - error->ier = I915_READ(GEN8_DE_MISC_IER); - error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE); - error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE); - error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE); - error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE); - error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE); - error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE); + if (INTEL_GEN(i915) >= 11) { + error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); + error->gtier[0] = + intel_uncore_read(uncore, + GEN11_RENDER_COPY_INTR_ENABLE); + error->gtier[1] = + intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE); + error->gtier[2] = + intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE); + error->gtier[3] = + intel_uncore_read(uncore, + GEN11_GPM_WGBOXPERF_INTR_ENABLE); + error->gtier[4] = + intel_uncore_read(uncore, + GEN11_CRYPTO_RSVD_INTR_ENABLE); + error->gtier[5] = + intel_uncore_read(uncore, + GEN11_GUNIT_CSME_INTR_ENABLE); error->ngtier = 6; - } else if (INTEL_GEN(dev_priv) >= 8) { - error->ier = I915_READ(GEN8_DE_MISC_IER); + } else if (INTEL_GEN(i915) >= 8) { + error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER); for (i = 0; i < 4; i++) - error->gtier[i] = I915_READ(GEN8_GT_IER(i)); + error->gtier[i] = intel_uncore_read(uncore, + GEN8_GT_IER(i)); error->ngtier = 4; - } else if (HAS_PCH_SPLIT(dev_priv)) { - error->ier = I915_READ(DEIER); - error->gtier[0] = I915_READ(GTIER); + } else if (HAS_PCH_SPLIT(i915)) { + error->ier = intel_uncore_read(uncore, DEIER); + error->gtier[0] = intel_uncore_read(uncore, GTIER); error->ngtier = 1; - } else if (IS_GEN(dev_priv, 2)) { - error->ier = I915_READ16(IER); - } else if (!IS_VALLEYVIEW(dev_priv)) { - error->ier = I915_READ(IER); + } else if (IS_GEN(i915, 2)) { + error->ier = intel_uncore_read16(uncore, GEN2_IER); + } else if (!IS_VALLEYVIEW(i915)) { + error->ier = intel_uncore_read(uncore, GEN2_IER); } - error->eir = I915_READ(EIR); - error->pgtbl_er = I915_READ(PGTBL_ER); + error->eir = intel_uncore_read(uncore, EIR); + error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER); } static const char * -error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg) +error_msg(struct i915_gpu_state *error, + intel_engine_mask_t engines, const char *msg) { int len; - int i; - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) - if (!error->engine[i].context.pid) - engines &= ~BIT(i); len = scnprintf(error->error_msg, sizeof(error->error_msg), - "GPU HANG: ecode %d:%lx:0x%08x", + "GPU HANG: ecode %d:%x:0x%08x", INTEL_GEN(error->i915), engines, - i915_error_generate_code(error, engines)); - if (engines) { + i915_error_generate_code(error)); + if (error->engine) { /* Just show the first executing process, more is confusing */ - i = __ffs(engines); len += scnprintf(error->error_msg + len, sizeof(error->error_msg) - len, ", in %s [%d]", - error->engine[i].context.comm, - error->engine[i].context.pid); + error->engine->context.comm, + error->engine->context.pid); } if (msg) len += scnprintf(error->error_msg + len, @@ -1765,56 +1712,15 @@ static void capture_params(struct i915_gpu_state *error) i915_params_copy(&error->params, &i915_modparams); } -static unsigned long capture_find_epoch(const struct i915_gpu_state *error) -{ - unsigned long epoch = error->capture; - int i; - - for (i = 0; i < ARRAY_SIZE(error->engine); i++) { - const struct drm_i915_error_engine *ee = &error->engine[i]; - - if (ee->hangcheck_timestamp && - time_before(ee->hangcheck_timestamp, epoch)) - epoch = ee->hangcheck_timestamp; - } - - return epoch; -} - static void capture_finish(struct i915_gpu_state *error) { struct i915_ggtt *ggtt = &error->i915->ggtt; - const u64 slot = ggtt->error_capture.start; - - ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); -} - -static int capture(void *data) -{ - struct i915_gpu_state *error = data; - - error->time = ktime_get_real(); - error->boottime = ktime_get_boottime(); - error->uptime = ktime_sub(ktime_get(), - error->i915->gt.last_init_time); - error->capture = jiffies; - capture_params(error); - capture_gen_state(error); - capture_uc_state(error); - capture_reg_state(error); - gem_record_fences(error); - gem_record_rings(error); - capture_active_buffers(error); - capture_pinned_buffers(error); - - error->overlay = intel_overlay_capture_error_state(error->i915); - error->display = intel_display_capture_error_state(error->i915); + if (drm_mm_node_allocated(&ggtt->error_capture)) { + const u64 slot = ggtt->error_capture.start; - error->epoch = capture_find_epoch(error); - - capture_finish(error); - return 0; + ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); + } } #define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x)) @@ -1823,22 +1729,45 @@ struct i915_gpu_state * i915_capture_gpu_state(struct drm_i915_private *i915) { struct i915_gpu_state *error; + struct compress compress; /* Check if GPU capture has been disabled */ error = READ_ONCE(i915->gpu_error.first_error); if (IS_ERR(error)) return error; - error = kzalloc(sizeof(*error), GFP_ATOMIC); + error = kzalloc(sizeof(*error), ALLOW_FAIL); if (!error) { i915_disable_error_state(i915, -ENOMEM); return ERR_PTR(-ENOMEM); } + if (!compress_init(&compress)) { + kfree(error); + i915_disable_error_state(i915, -ENOMEM); + return ERR_PTR(-ENOMEM); + } + kref_init(&error->ref); error->i915 = i915; - stop_machine(capture, error, NULL); + error->time = ktime_get_real(); + error->boottime = ktime_get_boottime(); + error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time); + error->capture = jiffies; + + capture_params(error); + capture_gen_state(error); + capture_uc_state(error, &compress); + capture_reg_state(error); + gem_record_fences(error); + gem_record_rings(error, &compress); + + error->overlay = intel_overlay_capture_error_state(i915); + error->display = intel_display_capture_error_state(i915); + + capture_finish(error); + compress_fini(&compress); return error; } @@ -1855,7 +1784,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915) * to pick up. */ void i915_capture_error_state(struct drm_i915_private *i915, - unsigned long engine_mask, + intel_engine_mask_t engine_mask, const char *msg) { static bool warned; @@ -1888,15 +1817,14 @@ void i915_capture_error_state(struct drm_i915_private *i915, return; } - if (!warned && + if (!xchg(&warned, true) && ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) { - DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); - DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); - DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); - DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n"); - DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n", - i915->drm.primary->index); - warned = true; + pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n"); + pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n"); + pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n"); + pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n"); + pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n", + i915->drm.primary->index); } } |
