aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/i915/i915_gpu_error.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gpu_error.c')
-rw-r--r--drivers/gpu/drm/i915/i915_gpu_error.c1228
1 files changed, 578 insertions, 650 deletions
diff --git a/drivers/gpu/drm/i915/i915_gpu_error.c b/drivers/gpu/drm/i915/i915_gpu_error.c
index aa6791255252..3c85cb0ee99f 100644
--- a/drivers/gpu/drm/i915/i915_gpu_error.c
+++ b/drivers/gpu/drm/i915/i915_gpu_error.c
@@ -29,56 +29,27 @@
#include <linux/ascii85.h>
#include <linux/nmi.h>
+#include <linux/pagevec.h>
#include <linux/scatterlist.h>
-#include <linux/stop_machine.h>
#include <linux/utsname.h>
#include <linux/zlib.h>
#include <drm/drm_print.h>
-#include "i915_gpu_error.h"
-#include "i915_drv.h"
-
-static inline const struct intel_engine_cs *
-engine_lookup(const struct drm_i915_private *i915, unsigned int id)
-{
- if (id >= I915_NUM_ENGINES)
- return NULL;
-
- return i915->engine[id];
-}
-
-static inline const char *
-__engine_name(const struct intel_engine_cs *engine)
-{
- return engine ? engine->name : "";
-}
+#include "display/intel_atomic.h"
+#include "display/intel_overlay.h"
-static const char *
-engine_name(const struct drm_i915_private *i915, unsigned int id)
-{
- return __engine_name(engine_lookup(i915, id));
-}
+#include "gem/i915_gem_context.h"
+#include "gem/i915_gem_lmem.h"
-static const char *tiling_flag(int tiling)
-{
- switch (tiling) {
- default:
- case I915_TILING_NONE: return "";
- case I915_TILING_X: return " X";
- case I915_TILING_Y: return " Y";
- }
-}
-
-static const char *dirty_flag(int dirty)
-{
- return dirty ? " dirty" : "";
-}
+#include "i915_drv.h"
+#include "i915_gpu_error.h"
+#include "i915_memcpy.h"
+#include "i915_scatterlist.h"
+#include "intel_csr.h"
-static const char *purgeable_flag(int purgeable)
-{
- return purgeable ? " purgeable" : "";
-}
+#define ALLOW_FAIL (GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN)
+#define ATOMIC_MAYFAIL (GFP_ATOMIC | __GFP_NOWARN)
static void __sg_set_buf(struct scatterlist *sg,
void *addr, unsigned int len, loff_t it)
@@ -107,7 +78,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
if (e->cur == e->end) {
struct scatterlist *sgl;
- sgl = (typeof(sgl))__get_free_page(GFP_KERNEL);
+ sgl = (typeof(sgl))__get_free_page(ALLOW_FAIL);
if (!sgl) {
e->err = -ENOMEM;
return false;
@@ -127,7 +98,7 @@ static bool __i915_error_grow(struct drm_i915_error_state_buf *e, size_t len)
}
e->size = ALIGN(len + 1, SZ_64K);
- e->buf = kmalloc(e->size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+ e->buf = kmalloc(e->size, ALLOW_FAIL);
if (!e->buf) {
e->size = PAGE_ALIGN(len + 1);
e->buf = kmalloc(e->size, GFP_KERNEL);
@@ -204,47 +175,116 @@ i915_error_printer(struct drm_i915_error_state_buf *e)
return p;
}
+/* single threaded page allocator with a reserved stash for emergencies */
+static void pool_fini(struct pagevec *pv)
+{
+ pagevec_release(pv);
+}
+
+static int pool_refill(struct pagevec *pv, gfp_t gfp)
+{
+ while (pagevec_space(pv)) {
+ struct page *p;
+
+ p = alloc_page(gfp);
+ if (!p)
+ return -ENOMEM;
+
+ pagevec_add(pv, p);
+ }
+
+ return 0;
+}
+
+static int pool_init(struct pagevec *pv, gfp_t gfp)
+{
+ int err;
+
+ pagevec_init(pv);
+
+ err = pool_refill(pv, gfp);
+ if (err)
+ pool_fini(pv);
+
+ return err;
+}
+
+static void *pool_alloc(struct pagevec *pv, gfp_t gfp)
+{
+ struct page *p;
+
+ p = alloc_page(gfp);
+ if (!p && pagevec_count(pv))
+ p = pv->pages[--pv->nr];
+
+ return p ? page_address(p) : NULL;
+}
+
+static void pool_free(struct pagevec *pv, void *addr)
+{
+ struct page *p = virt_to_page(addr);
+
+ if (pagevec_space(pv))
+ pagevec_add(pv, p);
+ else
+ __free_page(p);
+}
+
#ifdef CONFIG_DRM_I915_COMPRESS_ERROR
struct compress {
+ struct pagevec pool;
struct z_stream_s zstream;
void *tmp;
+ bool wc;
};
static bool compress_init(struct compress *c)
{
- struct z_stream_s *zstream = memset(&c->zstream, 0, sizeof(c->zstream));
+ struct z_stream_s *zstream = &c->zstream;
- zstream->workspace =
- kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
- GFP_ATOMIC | __GFP_NOWARN);
- if (!zstream->workspace)
+ if (pool_init(&c->pool, ALLOW_FAIL))
return false;
- if (zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) != Z_OK) {
- kfree(zstream->workspace);
+ zstream->workspace =
+ kmalloc(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
+ ALLOW_FAIL);
+ if (!zstream->workspace) {
+ pool_fini(&c->pool);
return false;
}
c->tmp = NULL;
if (i915_has_memcpy_from_wc())
- c->tmp = (void *)__get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+ c->tmp = pool_alloc(&c->pool, ALLOW_FAIL);
return true;
}
-static void *compress_next_page(struct drm_i915_error_object *dst)
+static bool compress_start(struct compress *c)
+{
+ struct z_stream_s *zstream = &c->zstream;
+ void *workspace = zstream->workspace;
+
+ memset(zstream, 0, sizeof(*zstream));
+ zstream->workspace = workspace;
+
+ return zlib_deflateInit(zstream, Z_DEFAULT_COMPRESSION) == Z_OK;
+}
+
+static void *compress_next_page(struct compress *c,
+ struct drm_i915_error_object *dst)
{
- unsigned long page;
+ void *page;
if (dst->page_count >= dst->num_pages)
return ERR_PTR(-ENOSPC);
- page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
+ page = pool_alloc(&c->pool, ALLOW_FAIL);
if (!page)
return ERR_PTR(-ENOMEM);
- return dst->pages[dst->page_count++] = (void *)page;
+ return dst->pages[dst->page_count++] = page;
}
static int compress_page(struct compress *c,
@@ -254,13 +294,13 @@ static int compress_page(struct compress *c,
struct z_stream_s *zstream = &c->zstream;
zstream->next_in = src;
- if (c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
+ if (c->wc && c->tmp && i915_memcpy_from_wc(c->tmp, src, PAGE_SIZE))
zstream->next_in = c->tmp;
zstream->avail_in = PAGE_SIZE;
do {
if (zstream->avail_out == 0) {
- zstream->next_out = compress_next_page(dst);
+ zstream->next_out = compress_next_page(c, dst);
if (IS_ERR(zstream->next_out))
return PTR_ERR(zstream->next_out);
@@ -269,8 +309,6 @@ static int compress_page(struct compress *c,
if (zlib_deflate(zstream, Z_NO_FLUSH) != Z_OK)
return -EIO;
-
- touch_nmi_watchdog();
} while (zstream->avail_in);
/* Fallback to uncompressed if we increase size? */
@@ -288,7 +326,7 @@ static int compress_flush(struct compress *c,
do {
switch (zlib_deflate(zstream, Z_FINISH)) {
case Z_OK: /* more space requested */
- zstream->next_out = compress_next_page(dst);
+ zstream->next_out = compress_next_page(c, dst);
if (IS_ERR(zstream->next_out))
return PTR_ERR(zstream->next_out);
@@ -309,15 +347,17 @@ end:
return 0;
}
-static void compress_fini(struct compress *c,
- struct drm_i915_error_object *dst)
+static void compress_finish(struct compress *c)
{
- struct z_stream_s *zstream = &c->zstream;
+ zlib_deflateEnd(&c->zstream);
+}
- zlib_deflateEnd(zstream);
- kfree(zstream->workspace);
+static void compress_fini(struct compress *c)
+{
+ kfree(c->zstream.workspace);
if (c->tmp)
- free_page((unsigned long)c->tmp);
+ pool_free(&c->pool, c->tmp);
+ pool_fini(&c->pool);
}
static void err_compression_marker(struct drm_i915_error_state_buf *m)
@@ -328,10 +368,17 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
#else
struct compress {
+ struct pagevec pool;
+ bool wc;
};
static bool compress_init(struct compress *c)
{
+ return pool_init(&c->pool, ALLOW_FAIL) == 0;
+}
+
+static bool compress_start(struct compress *c)
+{
return true;
}
@@ -339,15 +386,13 @@ static int compress_page(struct compress *c,
void *src,
struct drm_i915_error_object *dst)
{
- unsigned long page;
void *ptr;
- page = __get_free_page(GFP_ATOMIC | __GFP_NOWARN);
- if (!page)
+ ptr = pool_alloc(&c->pool, ALLOW_FAIL);
+ if (!ptr)
return -ENOMEM;
- ptr = (void *)page;
- if (!i915_memcpy_from_wc(ptr, src, PAGE_SIZE))
+ if (!(c->wc && i915_memcpy_from_wc(ptr, src, PAGE_SIZE)))
memcpy(ptr, src, PAGE_SIZE);
dst->pages[dst->page_count++] = ptr;
@@ -360,11 +405,15 @@ static int compress_flush(struct compress *c,
return 0;
}
-static void compress_fini(struct compress *c,
- struct drm_i915_error_object *dst)
+static void compress_finish(struct compress *c)
{
}
+static void compress_fini(struct compress *c)
+{
+ pool_fini(&c->pool);
+}
+
static void err_compression_marker(struct drm_i915_error_state_buf *m)
{
err_puts(m, "~");
@@ -372,49 +421,17 @@ static void err_compression_marker(struct drm_i915_error_state_buf *m)
#endif
-static void print_error_buffers(struct drm_i915_error_state_buf *m,
- const char *name,
- struct drm_i915_error_buffer *err,
- int count)
-{
- err_printf(m, "%s [%d]:\n", name, count);
-
- while (count--) {
- err_printf(m, " %08x_%08x %8u %02x %02x %02x",
- upper_32_bits(err->gtt_offset),
- lower_32_bits(err->gtt_offset),
- err->size,
- err->read_domains,
- err->write_domain,
- err->wseqno);
- err_puts(m, tiling_flag(err->tiling));
- err_puts(m, dirty_flag(err->dirty));
- err_puts(m, purgeable_flag(err->purgeable));
- err_puts(m, err->userptr ? " userptr" : "");
- err_puts(m, err->engine != -1 ? " " : "");
- err_puts(m, engine_name(m->i915, err->engine));
- err_puts(m, i915_cache_level_str(m->i915, err->cache_level));
-
- if (err->name)
- err_printf(m, " (name: %d)", err->name);
- if (err->fence_reg != I915_FENCE_REG_NONE)
- err_printf(m, " (fence: %d)", err->fence_reg);
-
- err_puts(m, "\n");
- err++;
- }
-}
-
static void error_print_instdone(struct drm_i915_error_state_buf *m,
const struct drm_i915_error_engine *ee)
{
+ const struct sseu_dev_info *sseu = &RUNTIME_INFO(m->i915)->sseu;
int slice;
int subslice;
err_printf(m, " INSTDONE: 0x%08x\n",
ee->instdone.instdone);
- if (ee->engine_id != RCS || INTEL_GEN(m->i915) <= 3)
+ if (ee->engine->class != RENDER_CLASS || INTEL_GEN(m->i915) <= 3)
return;
err_printf(m, " SC_INSTDONE: 0x%08x\n",
@@ -423,22 +440,17 @@ static void error_print_instdone(struct drm_i915_error_state_buf *m,
if (INTEL_GEN(m->i915) <= 6)
return;
- for_each_instdone_slice_subslice(m->i915, slice, subslice)
+ for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
err_printf(m, " SAMPLER_INSTDONE[%d][%d]: 0x%08x\n",
slice, subslice,
ee->instdone.sampler[slice][subslice]);
- for_each_instdone_slice_subslice(m->i915, slice, subslice)
+ for_each_instdone_slice_subslice(m->i915, sseu, slice, subslice)
err_printf(m, " ROW_INSTDONE[%d][%d]: 0x%08x\n",
slice, subslice,
ee->instdone.row[slice][subslice]);
}
-static const char *bannable(const struct drm_i915_error_context *ctx)
-{
- return ctx->bannable ? "" : " (unbannable)";
-}
-
static void error_print_request(struct drm_i915_error_state_buf *m,
const char *prefix,
const struct drm_i915_error_request *erq,
@@ -447,9 +459,8 @@ static void error_print_request(struct drm_i915_error_state_buf *m,
if (!erq->seqno)
return;
- err_printf(m, "%s pid %d, ban score %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
- prefix, erq->pid, erq->ban_score,
- erq->context, erq->seqno,
+ err_printf(m, "%s pid %d, seqno %8x:%08x%s%s, prio %d, emitted %dms, start %08x, head %08x, tail %08x\n",
+ prefix, erq->pid, erq->context, erq->seqno,
test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
&erq->flags) ? "!" : "",
test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
@@ -463,9 +474,8 @@ static void error_print_context(struct drm_i915_error_state_buf *m,
const char *header,
const struct drm_i915_error_context *ctx)
{
- err_printf(m, "%s%s[%d] user_handle %d hw_id %d, prio %d, ban score %d%s guilty %d active %d\n",
- header, ctx->comm, ctx->pid, ctx->handle, ctx->hw_id,
- ctx->sched_attr.priority, ctx->ban_score, bannable(ctx),
+ err_printf(m, "%s%s[%d] prio %d, guilty %d active %d\n",
+ header, ctx->comm, ctx->pid, ctx->sched_attr.priority,
ctx->guilty, ctx->active);
}
@@ -475,8 +485,7 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
{
int n;
- err_printf(m, "%s command stream:\n",
- engine_name(m->i915, ee->engine_id));
+ err_printf(m, "%s command stream:\n", ee->engine->name);
err_printf(m, " IDLE?: %s\n", yesno(ee->idle));
err_printf(m, " START: 0x%08x\n", ee->start);
err_printf(m, " HEAD: 0x%08x [0x%08x]\n", ee->head, ee->rq_head);
@@ -512,13 +521,6 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
if (INTEL_GEN(m->i915) >= 6) {
err_printf(m, " RC PSMI: 0x%08x\n", ee->rc_psmi);
err_printf(m, " FAULT_REG: 0x%08x\n", ee->fault_reg);
- err_printf(m, " SYNC_0: 0x%08x\n",
- ee->semaphore_mboxes[0]);
- err_printf(m, " SYNC_1: 0x%08x\n",
- ee->semaphore_mboxes[1]);
- if (HAS_VEBOX(m->i915))
- err_printf(m, " SYNC_2: 0x%08x\n",
- ee->semaphore_mboxes[2]);
}
if (HAS_PPGTT(m->i915)) {
err_printf(m, " GFX_MODE: 0x%08x\n", ee->vm_info.gfx_mode);
@@ -533,14 +535,8 @@ static void error_print_engine(struct drm_i915_error_state_buf *m,
ee->vm_info.pp_dir_base);
}
}
- err_printf(m, " seqno: 0x%08x\n", ee->seqno);
- err_printf(m, " last_seqno: 0x%08x\n", ee->last_seqno);
err_printf(m, " ring->head: 0x%08x\n", ee->cpu_ring_head);
err_printf(m, " ring->tail: 0x%08x\n", ee->cpu_ring_tail);
- err_printf(m, " hangcheck timestamp: %dms (%lu%s)\n",
- jiffies_to_msecs(ee->hangcheck_timestamp - epoch),
- ee->hangcheck_timestamp,
- ee->hangcheck_timestamp == epoch ? "; epoch" : "");
err_printf(m, " engine reset count: %u\n", ee->reset_count);
for (n = 0; n < ee->num_ports; n++) {
@@ -561,9 +557,9 @@ void i915_error_printf(struct drm_i915_error_state_buf *e, const char *f, ...)
}
static void print_error_obj(struct drm_i915_error_state_buf *m,
- struct intel_engine_cs *engine,
+ const struct intel_engine_cs *engine,
const char *name,
- struct drm_i915_error_object *obj)
+ const struct drm_i915_error_object *obj)
{
char out[ASCII85_BUFSZ];
int page;
@@ -578,6 +574,9 @@ static void print_error_obj(struct drm_i915_error_state_buf *m,
lower_32_bits(obj->gtt_offset));
}
+ if (obj->gtt_page_sizes > I915_GTT_PAGE_SIZE_4K)
+ err_printf(m, "gtt_page_sizes = 0x%08x\n", obj->gtt_page_sizes);
+
err_compression_marker(m);
for (page = 0; page < obj->page_count; page++) {
int i, len;
@@ -632,7 +631,7 @@ static void err_print_uc(struct drm_i915_error_state_buf *m,
const struct i915_gpu_state *error =
container_of(error_uc, typeof(*error), uc);
- if (!error->device_info.has_guc)
+ if (!error->device_info.has_gt_uc)
return;
intel_uc_fw_dump(&error_uc->guc_fw, &p);
@@ -660,7 +659,7 @@ static void err_free_sgl(struct scatterlist *sgl)
static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
struct i915_gpu_state *error)
{
- struct drm_i915_error_object *obj;
+ const struct drm_i915_error_engine *ee;
struct timespec64 ts;
int i, j;
@@ -669,6 +668,7 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
err_printf(m, "Kernel: %s %s\n",
init_utsname()->release,
init_utsname()->machine);
+ err_printf(m, "Driver: %s\n", DRIVER_DATE);
ts = ktime_to_timespec64(error->time);
err_printf(m, "Time: %lld s %ld us\n",
(s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
@@ -678,26 +678,21 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
ts = ktime_to_timespec64(error->uptime);
err_printf(m, "Uptime: %lld s %ld us\n",
(s64)ts.tv_sec, ts.tv_nsec / NSEC_PER_USEC);
- err_printf(m, "Epoch: %lu jiffies (%u HZ)\n", error->epoch, HZ);
- err_printf(m, "Capture: %lu jiffies; %d ms ago, %d ms after epoch\n",
- error->capture,
- jiffies_to_msecs(jiffies - error->capture),
- jiffies_to_msecs(error->capture - error->epoch));
-
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- if (!error->engine[i].context.pid)
- continue;
+ err_printf(m, "Capture: %lu jiffies; %d ms ago\n",
+ error->capture, jiffies_to_msecs(jiffies - error->capture));
+
+ for (ee = error->engine; ee; ee = ee->next)
+ err_printf(m, "Active process (on ring %s): %s [%d]\n",
+ ee->engine->name,
+ ee->context.comm,
+ ee->context.pid);
- err_printf(m, "Active process (on ring %s): %s [%d], score %d%s\n",
- engine_name(m->i915, i),
- error->engine[i].context.comm,
- error->engine[i].context.pid,
- error->engine[i].context.ban_score,
- bannable(&error->engine[i].context));
- }
err_printf(m, "Reset count: %u\n", error->reset_count);
err_printf(m, "Suspend count: %u\n", error->suspend_count);
err_printf(m, "Platform: %s\n", intel_platform_name(error->device_info.platform));
+ err_printf(m, "Subplatform: 0x%x\n",
+ intel_subplatform(&error->runtime_info,
+ error->device_info.platform));
err_print_pciid(m, m->i915);
err_printf(m, "IOMMU enabled?: %d\n", error->iommu);
@@ -727,101 +722,73 @@ static void __err_print_to_sgl(struct drm_i915_error_state_buf *m,
for (i = 0; i < error->nfence; i++)
err_printf(m, " fence[%d] = %08llx\n", i, error->fence[i]);
- if (INTEL_GEN(m->i915) >= 6) {
+ if (IS_GEN_RANGE(m->i915, 6, 11)) {
err_printf(m, "ERROR: 0x%08x\n", error->error);
-
- if (INTEL_GEN(m->i915) >= 8)
- err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
- error->fault_data1, error->fault_data0);
-
err_printf(m, "DONE_REG: 0x%08x\n", error->done_reg);
}
+ if (INTEL_GEN(m->i915) >= 8)
+ err_printf(m, "FAULT_TLB_DATA: 0x%08x 0x%08x\n",
+ error->fault_data1, error->fault_data0);
+
if (IS_GEN(m->i915, 7))
err_printf(m, "ERR_INT: 0x%08x\n", error->err_int);
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- if (error->engine[i].engine_id != -1)
- error_print_engine(m, &error->engine[i], error->epoch);
- }
+ if (IS_GEN_RANGE(m->i915, 8, 11))
+ err_printf(m, "GTT_CACHE_EN: 0x%08x\n", error->gtt_cache);
- for (i = 0; i < ARRAY_SIZE(error->active_vm); i++) {
- char buf[128];
- int len, first = 1;
+ if (IS_GEN(m->i915, 12))
+ err_printf(m, "AUX_ERR_DBG: 0x%08x\n", error->aux_err);
- if (!error->active_vm[i])
- break;
+ if (INTEL_GEN(m->i915) >= 12) {
+ int i;
- len = scnprintf(buf, sizeof(buf), "Active (");
- for (j = 0; j < ARRAY_SIZE(error->engine); j++) {
- if (error->engine[j].vm != error->active_vm[i])
- continue;
+ for (i = 0; i < GEN12_SFC_DONE_MAX; i++)
+ err_printf(m, " SFC_DONE[%d]: 0x%08x\n", i,
+ error->sfc_done[i]);
- len += scnprintf(buf + len, sizeof(buf), "%s%s",
- first ? "" : ", ",
- m->i915->engine[j]->name);
- first = 0;
- }
- scnprintf(buf + len, sizeof(buf), ")");
- print_error_buffers(m, buf,
- error->active_bo[i],
- error->active_bo_count[i]);
+ err_printf(m, " GAM_DONE: 0x%08x\n", error->gam_done);
}
- print_error_buffers(m, "Pinned (global)",
- error->pinned_bo,
- error->pinned_bo_count);
+ for (ee = error->engine; ee; ee = ee->next)
+ error_print_engine(m, ee, error->capture);
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- const struct drm_i915_error_engine *ee = &error->engine[i];
+ for (ee = error->engine; ee; ee = ee->next) {
+ const struct drm_i915_error_object *obj;
obj = ee->batchbuffer;
if (obj) {
- err_puts(m, m->i915->engine[i]->name);
+ err_puts(m, ee->engine->name);
if (ee->context.pid)
- err_printf(m, " (submitted by %s [%d], ctx %d [%d], score %d%s)",
+ err_printf(m, " (submitted by %s [%d])",
ee->context.comm,
- ee->context.pid,
- ee->context.handle,
- ee->context.hw_id,
- ee->context.ban_score,
- bannable(&ee->context));
+ ee->context.pid);
err_printf(m, " --- gtt_offset = 0x%08x %08x\n",
upper_32_bits(obj->gtt_offset),
lower_32_bits(obj->gtt_offset));
- print_error_obj(m, m->i915->engine[i], NULL, obj);
+ print_error_obj(m, ee->engine, NULL, obj);
}
for (j = 0; j < ee->user_bo_count; j++)
- print_error_obj(m, m->i915->engine[i],
- "user", ee->user_bo[j]);
+ print_error_obj(m, ee->engine, "user", ee->user_bo[j]);
if (ee->num_requests) {
err_printf(m, "%s --- %d requests\n",
- m->i915->engine[i]->name,
+ ee->engine->name,
ee->num_requests);
for (j = 0; j < ee->num_requests; j++)
error_print_request(m, " ",
&ee->requests[j],
- error->epoch);
+ error->capture);
}
- print_error_obj(m, m->i915->engine[i],
- "ringbuffer", ee->ringbuffer);
-
- print_error_obj(m, m->i915->engine[i],
- "HW Status", ee->hws_page);
-
- print_error_obj(m, m->i915->engine[i],
- "HW context", ee->ctx);
-
- print_error_obj(m, m->i915->engine[i],
- "WA context", ee->wa_ctx);
-
- print_error_obj(m, m->i915->engine[i],
+ print_error_obj(m, ee->engine, "ringbuffer", ee->ringbuffer);
+ print_error_obj(m, ee->engine, "HW Status", ee->hws_page);
+ print_error_obj(m, ee->engine, "HW context", ee->ctx);
+ print_error_obj(m, ee->engine, "WA context", ee->wa_ctx);
+ print_error_obj(m, ee->engine,
"WA batchbuffer", ee->wa_batchbuffer);
-
- print_error_obj(m, m->i915->engine[i],
+ print_error_obj(m, ee->engine,
"NULL context", ee->default_state);
}
@@ -970,13 +937,15 @@ void __i915_gpu_state_free(struct kref *error_ref)
{
struct i915_gpu_state *error =
container_of(error_ref, typeof(*error), ref);
- long i, j;
+ long i;
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- struct drm_i915_error_engine *ee = &error->engine[i];
+ while (error->engine) {
+ struct drm_i915_error_engine *ee = error->engine;
- for (j = 0; j < ee->user_bo_count; j++)
- i915_error_object_free(ee->user_bo[j]);
+ error->engine = ee->next;
+
+ for (i = 0; i < ee->user_bo_count; i++)
+ i915_error_object_free(ee->user_bo[i]);
kfree(ee->user_bo);
i915_error_object_free(ee->batchbuffer);
@@ -987,12 +956,9 @@ void __i915_gpu_state_free(struct kref *error_ref)
i915_error_object_free(ee->wa_ctx);
kfree(ee->requests);
+ kfree(ee);
}
- for (i = 0; i < ARRAY_SIZE(error->active_bo); i++)
- kfree(error->active_bo[i]);
- kfree(error->pinned_bo);
-
kfree(error->overlay);
kfree(error->display);
@@ -1005,130 +971,98 @@ void __i915_gpu_state_free(struct kref *error_ref)
static struct drm_i915_error_object *
i915_error_object_create(struct drm_i915_private *i915,
- struct i915_vma *vma)
+ struct i915_vma *vma,
+ struct compress *compress)
{
struct i915_ggtt *ggtt = &i915->ggtt;
const u64 slot = ggtt->error_capture.start;
struct drm_i915_error_object *dst;
- struct compress compress;
unsigned long num_pages;
struct sgt_iter iter;
- dma_addr_t dma;
int ret;
+ might_sleep();
+
if (!vma || !vma->pages)
return NULL;
num_pages = min_t(u64, vma->size, vma->obj->base.size) >> PAGE_SHIFT;
num_pages = DIV_ROUND_UP(10 * num_pages, 8); /* worstcase zlib growth */
- dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *),
- GFP_ATOMIC | __GFP_NOWARN);
+ dst = kmalloc(sizeof(*dst) + num_pages * sizeof(u32 *), ALLOW_FAIL);
if (!dst)
return NULL;
+ if (!compress_start(compress)) {
+ kfree(dst);
+ return NULL;
+ }
+
dst->gtt_offset = vma->node.start;
dst->gtt_size = vma->node.size;
+ dst->gtt_page_sizes = vma->page_sizes.gtt;
dst->num_pages = num_pages;
dst->page_count = 0;
dst->unused = 0;
- if (!compress_init(&compress)) {
- kfree(dst);
- return NULL;
- }
+ compress->wc = i915_gem_object_is_lmem(vma->obj) ||
+ drm_mm_node_allocated(&ggtt->error_capture);
ret = -EINVAL;
- for_each_sgt_dma(dma, iter, vma->pages) {
+ if (drm_mm_node_allocated(&ggtt->error_capture)) {
void __iomem *s;
+ dma_addr_t dma;
- ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0);
-
- s = io_mapping_map_atomic_wc(&ggtt->iomap, slot);
- ret = compress_page(&compress, (void __force *)s, dst);
- io_mapping_unmap_atomic(s);
- if (ret)
- break;
- }
-
- if (ret || compress_flush(&compress, dst)) {
- while (dst->page_count--)
- free_page((unsigned long)dst->pages[dst->page_count]);
- kfree(dst);
- dst = NULL;
- }
-
- compress_fini(&compress, dst);
- return dst;
-}
-
-/* The error capture is special as tries to run underneath the normal
- * locking rules - so we use the raw version of the i915_active_request lookup.
- */
-static inline u32
-__active_get_seqno(struct i915_active_request *active)
-{
- struct i915_request *request;
-
- request = __i915_active_request_peek(active);
- return request ? request->global_seqno : 0;
-}
+ for_each_sgt_daddr(dma, iter, vma->pages) {
+ ggtt->vm.insert_page(&ggtt->vm, dma, slot,
+ I915_CACHE_NONE, 0);
-static inline int
-__active_get_engine_id(struct i915_active_request *active)
-{
- struct i915_request *request;
-
- request = __i915_active_request_peek(active);
- return request ? request->engine->id : -1;
-}
-
-static void capture_bo(struct drm_i915_error_buffer *err,
- struct i915_vma *vma)
-{
- struct drm_i915_gem_object *obj = vma->obj;
+ s = io_mapping_map_wc(&ggtt->iomap, slot, PAGE_SIZE);
+ ret = compress_page(compress, (void __force *)s, dst);
+ io_mapping_unmap(s);
+ if (ret)
+ break;
+ }
+ } else if (i915_gem_object_is_lmem(vma->obj)) {
+ struct intel_memory_region *mem = vma->obj->mm.region;
+ dma_addr_t dma;
- err->size = obj->base.size;
- err->name = obj->base.name;
+ for_each_sgt_daddr(dma, iter, vma->pages) {
+ void __iomem *s;
- err->wseqno = __active_get_seqno(&obj->frontbuffer_write);
- err->engine = __active_get_engine_id(&obj->frontbuffer_write);
+ s = io_mapping_map_wc(&mem->iomap, dma, PAGE_SIZE);
+ ret = compress_page(compress, (void __force *)s, dst);
+ io_mapping_unmap(s);
+ if (ret)
+ break;
+ }
+ } else {
+ struct page *page;
- err->gtt_offset = vma->node.start;
- err->read_domains = obj->read_domains;
- err->write_domain = obj->write_domain;
- err->fence_reg = vma->fence ? vma->fence->id : -1;
- err->tiling = i915_gem_object_get_tiling(obj);
- err->dirty = obj->mm.dirty;
- err->purgeable = obj->mm.madv != I915_MADV_WILLNEED;
- err->userptr = obj->userptr.mm != NULL;
- err->cache_level = obj->cache_level;
-}
+ for_each_sgt_page(page, iter, vma->pages) {
+ void *s;
-static u32 capture_error_bo(struct drm_i915_error_buffer *err,
- int count, struct list_head *head,
- unsigned int flags)
-#define ACTIVE_ONLY BIT(0)
-#define PINNED_ONLY BIT(1)
-{
- struct i915_vma *vma;
- int i = 0;
+ drm_clflush_pages(&page, 1);
- list_for_each_entry(vma, head, vm_link) {
- if (!vma->obj)
- continue;
+ s = kmap(page);
+ ret = compress_page(compress, s, dst);
+ kunmap(s);
- if (flags & ACTIVE_ONLY && !i915_vma_is_active(vma))
- continue;
+ drm_clflush_pages(&page, 1);
- if (flags & PINNED_ONLY && !i915_vma_is_pinned(vma))
- continue;
+ if (ret)
+ break;
+ }
+ }
- capture_bo(err++, vma);
- if (++i == count)
- break;
+ if (ret || compress_flush(compress, dst)) {
+ while (dst->page_count--)
+ pool_free(&compress->pool, dst->pages[dst->page_count]);
+ kfree(dst);
+ dst = NULL;
}
+ compress_finish(compress);
- return i;
+ return dst;
}
/*
@@ -1141,55 +1075,43 @@ static u32 capture_error_bo(struct drm_i915_error_buffer *err,
*
* It's only a small step better than a random number in its current form.
*/
-static u32 i915_error_generate_code(struct i915_gpu_state *error,
- unsigned long engine_mask)
+static u32 i915_error_generate_code(struct i915_gpu_state *error)
{
+ const struct drm_i915_error_engine *ee = error->engine;
+
/*
* IPEHR would be an ideal way to detect errors, as it's the gross
* measure of "the command that hung." However, has some very common
* synchronization commands which almost always appear in the case
* strictly a client bug. Use instdone to differentiate those some.
*/
- if (engine_mask) {
- struct drm_i915_error_engine *ee =
- &error->engine[ffs(engine_mask)];
-
- return ee->ipehr ^ ee->instdone.instdone;
- }
-
- return 0;
+ return ee ? ee->ipehr ^ ee->instdone.instdone : 0;
}
static void gem_record_fences(struct i915_gpu_state *error)
{
struct drm_i915_private *dev_priv = error->i915;
+ struct intel_uncore *uncore = &dev_priv->uncore;
int i;
if (INTEL_GEN(dev_priv) >= 6) {
- for (i = 0; i < dev_priv->num_fence_regs; i++)
- error->fence[i] = I915_READ64(FENCE_REG_GEN6_LO(i));
+ for (i = 0; i < dev_priv->ggtt.num_fences; i++)
+ error->fence[i] =
+ intel_uncore_read64(uncore,
+ FENCE_REG_GEN6_LO(i));
} else if (INTEL_GEN(dev_priv) >= 4) {
- for (i = 0; i < dev_priv->num_fence_regs; i++)
- error->fence[i] = I915_READ64(FENCE_REG_965_LO(i));
+ for (i = 0; i < dev_priv->ggtt.num_fences; i++)
+ error->fence[i] =
+ intel_uncore_read64(uncore,
+ FENCE_REG_965_LO(i));
} else {
- for (i = 0; i < dev_priv->num_fence_regs; i++)
- error->fence[i] = I915_READ(FENCE_REG(i));
+ for (i = 0; i < dev_priv->ggtt.num_fences; i++)
+ error->fence[i] =
+ intel_uncore_read(uncore, FENCE_REG(i));
}
error->nfence = i;
}
-static void gen6_record_semaphore_state(struct intel_engine_cs *engine,
- struct drm_i915_error_engine *ee)
-{
- struct drm_i915_private *dev_priv = engine->i915;
-
- ee->semaphore_mboxes[0] = I915_READ(RING_SYNC_0(engine->mmio_base));
- ee->semaphore_mboxes[1] = I915_READ(RING_SYNC_1(engine->mmio_base));
- if (HAS_VEBOX(dev_priv))
- ee->semaphore_mboxes[2] =
- I915_READ(RING_SYNC_2(engine->mmio_base));
-}
-
static void error_record_engine_registers(struct i915_gpu_state *error,
struct intel_engine_cs *engine,
struct drm_i915_error_engine *ee)
@@ -1197,44 +1119,43 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
struct drm_i915_private *dev_priv = engine->i915;
if (INTEL_GEN(dev_priv) >= 6) {
- ee->rc_psmi = I915_READ(RING_PSMI_CTL(engine->mmio_base));
- if (INTEL_GEN(dev_priv) >= 8) {
+ ee->rc_psmi = ENGINE_READ(engine, RING_PSMI_CTL);
+
+ if (INTEL_GEN(dev_priv) >= 12)
+ ee->fault_reg = I915_READ(GEN12_RING_FAULT_REG);
+ else if (INTEL_GEN(dev_priv) >= 8)
ee->fault_reg = I915_READ(GEN8_RING_FAULT_REG);
- } else {
- gen6_record_semaphore_state(engine, ee);
- ee->fault_reg = I915_READ(RING_FAULT_REG(engine));
- }
+ else
+ ee->fault_reg = GEN6_RING_FAULT_REG_READ(engine);
}
if (INTEL_GEN(dev_priv) >= 4) {
- ee->faddr = I915_READ(RING_DMA_FADD(engine->mmio_base));
- ee->ipeir = I915_READ(RING_IPEIR(engine->mmio_base));
- ee->ipehr = I915_READ(RING_IPEHR(engine->mmio_base));
- ee->instps = I915_READ(RING_INSTPS(engine->mmio_base));
- ee->bbaddr = I915_READ(RING_BBADDR(engine->mmio_base));
+ ee->faddr = ENGINE_READ(engine, RING_DMA_FADD);
+ ee->ipeir = ENGINE_READ(engine, RING_IPEIR);
+ ee->ipehr = ENGINE_READ(engine, RING_IPEHR);
+ ee->instps = ENGINE_READ(engine, RING_INSTPS);
+ ee->bbaddr = ENGINE_READ(engine, RING_BBADDR);
if (INTEL_GEN(dev_priv) >= 8) {
- ee->faddr |= (u64) I915_READ(RING_DMA_FADD_UDW(engine->mmio_base)) << 32;
- ee->bbaddr |= (u64) I915_READ(RING_BBADDR_UDW(engine->mmio_base)) << 32;
+ ee->faddr |= (u64)ENGINE_READ(engine, RING_DMA_FADD_UDW) << 32;
+ ee->bbaddr |= (u64)ENGINE_READ(engine, RING_BBADDR_UDW) << 32;
}
- ee->bbstate = I915_READ(RING_BBSTATE(engine->mmio_base));
+ ee->bbstate = ENGINE_READ(engine, RING_BBSTATE);
} else {
- ee->faddr = I915_READ(DMA_FADD_I8XX);
- ee->ipeir = I915_READ(IPEIR);
- ee->ipehr = I915_READ(IPEHR);
+ ee->faddr = ENGINE_READ(engine, DMA_FADD_I8XX);
+ ee->ipeir = ENGINE_READ(engine, IPEIR);
+ ee->ipehr = ENGINE_READ(engine, IPEHR);
}
intel_engine_get_instdone(engine, &ee->instdone);
- ee->instpm = I915_READ(RING_INSTPM(engine->mmio_base));
+ ee->instpm = ENGINE_READ(engine, RING_INSTPM);
ee->acthd = intel_engine_get_active_head(engine);
- ee->seqno = intel_engine_get_seqno(engine);
- ee->last_seqno = intel_engine_last_submit(engine);
- ee->start = I915_READ_START(engine);
- ee->head = I915_READ_HEAD(engine);
- ee->tail = I915_READ_TAIL(engine);
- ee->ctl = I915_READ_CTL(engine);
+ ee->start = ENGINE_READ(engine, RING_START);
+ ee->head = ENGINE_READ(engine, RING_HEAD);
+ ee->tail = ENGINE_READ(engine, RING_TAIL);
+ ee->ctl = ENGINE_READ(engine, RING_CTL);
if (INTEL_GEN(dev_priv) > 2)
- ee->mode = I915_READ_MODE(engine);
+ ee->mode = ENGINE_READ(engine, RING_MI_MODE);
if (!HWS_NEEDS_PHYSICAL(dev_priv)) {
i915_reg_t mmio;
@@ -1242,16 +1163,18 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
if (IS_GEN(dev_priv, 7)) {
switch (engine->id) {
default:
- case RCS:
+ MISSING_CASE(engine->id);
+ /* fall through */
+ case RCS0:
mmio = RENDER_HWS_PGA_GEN7;
break;
- case BCS:
+ case BCS0:
mmio = BLT_HWS_PGA_GEN7;
break;
- case VCS:
+ case VCS0:
mmio = BSD_HWS_PGA_GEN7;
break;
- case VECS:
+ case VECS0:
mmio = VEBOX_HWS_PGA_GEN7;
break;
}
@@ -1266,43 +1189,43 @@ static void error_record_engine_registers(struct i915_gpu_state *error,
}
ee->idle = intel_engine_is_idle(engine);
- if (!ee->idle)
- ee->hangcheck_timestamp = engine->hangcheck.action_timestamp;
ee->reset_count = i915_reset_engine_count(&dev_priv->gpu_error,
engine);
if (HAS_PPGTT(dev_priv)) {
int i;
- ee->vm_info.gfx_mode = I915_READ(RING_MODE_GEN7(engine));
+ ee->vm_info.gfx_mode = ENGINE_READ(engine, RING_MODE_GEN7);
- if (IS_GEN(dev_priv, 6))
+ if (IS_GEN(dev_priv, 6)) {
ee->vm_info.pp_dir_base =
- I915_READ(RING_PP_DIR_BASE_READ(engine));
- else if (IS_GEN(dev_priv, 7))
+ ENGINE_READ(engine, RING_PP_DIR_BASE_READ);
+ } else if (IS_GEN(dev_priv, 7)) {
ee->vm_info.pp_dir_base =
- I915_READ(RING_PP_DIR_BASE(engine));
- else if (INTEL_GEN(dev_priv) >= 8)
+ ENGINE_READ(engine, RING_PP_DIR_BASE);
+ } else if (INTEL_GEN(dev_priv) >= 8) {
+ u32 base = engine->mmio_base;
+
for (i = 0; i < 4; i++) {
ee->vm_info.pdp[i] =
- I915_READ(GEN8_RING_PDP_UDW(engine, i));
+ I915_READ(GEN8_RING_PDP_UDW(base, i));
ee->vm_info.pdp[i] <<= 32;
ee->vm_info.pdp[i] |=
- I915_READ(GEN8_RING_PDP_LDW(engine, i));
+ I915_READ(GEN8_RING_PDP_LDW(base, i));
}
+ }
}
}
-static void record_request(struct i915_request *request,
+static void record_request(const struct i915_request *request,
struct drm_i915_error_request *erq)
{
- struct i915_gem_context *ctx = request->gem_context;
+ const struct i915_gem_context *ctx = request->gem_context;
erq->flags = request->fence.flags;
- erq->context = ctx->hw_id;
+ erq->context = request->fence.context;
+ erq->seqno = request->fence.seqno;
erq->sched_attr = request->sched.attr;
- erq->ban_score = atomic_read(&ctx->ban_score);
- erq->seqno = request->global_seqno;
erq->jiffies = request->emitted_jiffies;
erq->start = i915_ggtt_offset(request->ring->vma);
erq->head = request->head;
@@ -1322,12 +1245,12 @@ static void engine_record_requests(struct intel_engine_cs *engine,
count = 0;
request = first;
- list_for_each_entry_from(request, &engine->timeline.requests, link)
+ list_for_each_entry_from(request, &engine->active.requests, sched.link)
count++;
if (!count)
return;
- ee->requests = kcalloc(count, sizeof(*ee->requests), GFP_ATOMIC);
+ ee->requests = kcalloc(count, sizeof(*ee->requests), ATOMIC_MAYFAIL);
if (!ee->requests)
return;
@@ -1335,7 +1258,8 @@ static void engine_record_requests(struct intel_engine_cs *engine,
count = 0;
request = first;
- list_for_each_entry_from(request, &engine->timeline.requests, link) {
+ list_for_each_entry_from(request,
+ &engine->active.requests, sched.link) {
if (count >= ee->num_requests) {
/*
* If the ring request list was changed in
@@ -1360,27 +1284,24 @@ static void engine_record_requests(struct intel_engine_cs *engine,
ee->num_requests = count;
}
-static void error_record_engine_execlists(struct intel_engine_cs *engine,
+static void error_record_engine_execlists(const struct intel_engine_cs *engine,
struct drm_i915_error_engine *ee)
{
const struct intel_engine_execlists * const execlists = &engine->execlists;
- unsigned int n;
+ struct i915_request * const *port = execlists->active;
+ unsigned int n = 0;
- for (n = 0; n < execlists_num_ports(execlists); n++) {
- struct i915_request *rq = port_request(&execlists->port[n]);
-
- if (!rq)
- break;
-
- record_request(rq, &ee->execlist[n]);
- }
+ while (*port)
+ record_request(*port++, &ee->execlist[n++]);
ee->num_ports = n;
}
-static void record_context(struct drm_i915_error_context *e,
- struct i915_gem_context *ctx)
+static bool record_context(struct drm_i915_error_context *e,
+ const struct i915_request *rq)
{
+ const struct i915_gem_context *ctx = rq->gem_context;
+
if (ctx->pid) {
struct task_struct *task;
@@ -1393,17 +1314,49 @@ static void record_context(struct drm_i915_error_context *e,
rcu_read_unlock();
}
- e->handle = ctx->user_handle;
- e->hw_id = ctx->hw_id;
e->sched_attr = ctx->sched;
- e->ban_score = atomic_read(&ctx->ban_score);
- e->bannable = i915_gem_context_is_bannable(ctx);
e->guilty = atomic_read(&ctx->guilty_count);
e->active = atomic_read(&ctx->active_count);
+
+ return i915_gem_context_no_error_capture(ctx);
}
-static void request_record_user_bo(struct i915_request *request,
- struct drm_i915_error_engine *ee)
+struct capture_vma {
+ struct capture_vma *next;
+ void **slot;
+};
+
+static struct capture_vma *
+capture_vma(struct capture_vma *next,
+ struct i915_vma *vma,
+ struct drm_i915_error_object **out)
+{
+ struct capture_vma *c;
+
+ *out = NULL;
+ if (!vma)
+ return next;
+
+ c = kmalloc(sizeof(*c), ATOMIC_MAYFAIL);
+ if (!c)
+ return next;
+
+ if (!i915_active_acquire_if_busy(&vma->active)) {
+ kfree(c);
+ return next;
+ }
+
+ c->slot = (void **)out;
+ *c->slot = i915_vma_get(vma);
+
+ c->next = next;
+ return c;
+}
+
+static struct capture_vma *
+request_record_user_bo(struct i915_request *request,
+ struct drm_i915_error_engine *ee,
+ struct capture_vma *capture)
{
struct i915_capture_list *c;
struct drm_i915_error_object **bo;
@@ -1413,33 +1366,34 @@ static void request_record_user_bo(struct i915_request *request,
for (c = request->capture_list; c; c = c->next)
max++;
if (!max)
- return;
+ return capture;
- bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+ bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
if (!bo) {
/* If we can't capture everything, try to capture something. */
max = min_t(long, max, PAGE_SIZE / sizeof(*bo));
- bo = kmalloc_array(max, sizeof(*bo), GFP_ATOMIC);
+ bo = kmalloc_array(max, sizeof(*bo), ATOMIC_MAYFAIL);
}
if (!bo)
- return;
+ return capture;
count = 0;
for (c = request->capture_list; c; c = c->next) {
- bo[count] = i915_error_object_create(request->i915, c->vma);
- if (!bo[count])
- break;
+ capture = capture_vma(capture, c->vma, &bo[count]);
if (++count == max)
break;
}
ee->user_bo = bo;
ee->user_bo_count = count;
+
+ return capture;
}
static struct drm_i915_error_object *
capture_object(struct drm_i915_private *dev_priv,
- struct drm_i915_gem_object *obj)
+ struct drm_i915_gem_object *obj,
+ struct compress *compress)
{
if (obj && i915_gem_object_has_pages(obj)) {
struct i915_vma fake = {
@@ -1449,184 +1403,147 @@ capture_object(struct drm_i915_private *dev_priv,
.obj = obj,
};
- return i915_error_object_create(dev_priv, &fake);
+ return i915_error_object_create(dev_priv, &fake, compress);
} else {
return NULL;
}
}
-static void gem_record_rings(struct i915_gpu_state *error)
+static void
+gem_record_rings(struct i915_gpu_state *error, struct compress *compress)
{
struct drm_i915_private *i915 = error->i915;
- struct i915_ggtt *ggtt = &i915->ggtt;
- int i;
+ struct intel_engine_cs *engine;
+ struct drm_i915_error_engine *ee;
+
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+ if (!ee)
+ return;
- for (i = 0; i < I915_NUM_ENGINES; i++) {
- struct intel_engine_cs *engine = i915->engine[i];
- struct drm_i915_error_engine *ee = &error->engine[i];
+ for_each_uabi_engine(engine, i915) {
+ struct capture_vma *capture = NULL;
struct i915_request *request;
+ unsigned long flags;
- ee->engine_id = -1;
+ /* Refill our page pool before entering atomic section */
+ pool_refill(&compress->pool, ALLOW_FAIL);
- if (!engine)
+ spin_lock_irqsave(&engine->active.lock, flags);
+ request = intel_engine_find_active_request(engine);
+ if (!request) {
+ spin_unlock_irqrestore(&engine->active.lock, flags);
continue;
+ }
- ee->engine_id = i;
+ error->simulated |= record_context(&ee->context, request);
- error_record_engine_registers(error, engine, ee);
- error_record_engine_execlists(engine, ee);
+ /*
+ * We need to copy these to an anonymous buffer
+ * as the simplest method to avoid being overwritten
+ * by userspace.
+ */
+ capture = capture_vma(capture,
+ request->batch,
+ &ee->batchbuffer);
- request = i915_gem_find_active_request(engine);
- if (request) {
- struct i915_gem_context *ctx = request->gem_context;
- struct intel_ring *ring;
+ if (HAS_BROKEN_CS_TLB(i915))
+ capture = capture_vma(capture,
+ engine->gt->scratch,
+ &ee->wa_batchbuffer);
- ee->vm = ctx->ppgtt ? &ctx->ppgtt->vm : &ggtt->vm;
+ capture = request_record_user_bo(request, ee, capture);
- record_context(&ee->context, ctx);
+ capture = capture_vma(capture,
+ request->hw_context->state,
+ &ee->ctx);
- /* We need to copy these to an anonymous buffer
- * as the simplest method to avoid being overwritten
- * by userspace.
- */
- ee->batchbuffer =
- i915_error_object_create(i915, request->batch);
+ capture = capture_vma(capture,
+ request->ring->vma,
+ &ee->ringbuffer);
- if (HAS_BROKEN_CS_TLB(i915))
- ee->wa_batchbuffer =
- i915_error_object_create(i915,
- i915->gt.scratch);
- request_record_user_bo(request, ee);
+ ee->cpu_ring_head = request->ring->head;
+ ee->cpu_ring_tail = request->ring->tail;
- ee->ctx =
- i915_error_object_create(i915,
- request->hw_context->state);
+ ee->rq_head = request->head;
+ ee->rq_post = request->postfix;
+ ee->rq_tail = request->tail;
- error->simulated |=
- i915_gem_context_no_error_capture(ctx);
+ engine_record_requests(engine, request, ee);
+ spin_unlock_irqrestore(&engine->active.lock, flags);
- ee->rq_head = request->head;
- ee->rq_post = request->postfix;
- ee->rq_tail = request->tail;
+ error_record_engine_registers(error, engine, ee);
+ error_record_engine_execlists(engine, ee);
- ring = request->ring;
- ee->cpu_ring_head = ring->head;
- ee->cpu_ring_tail = ring->tail;
- ee->ringbuffer =
- i915_error_object_create(i915, ring->vma);
+ while (capture) {
+ struct capture_vma *this = capture;
+ struct i915_vma *vma = *this->slot;
- engine_record_requests(engine, request, ee);
+ *this->slot =
+ i915_error_object_create(i915, vma, compress);
+
+ i915_active_release(&vma->active);
+ i915_vma_put(vma);
+
+ capture = this->next;
+ kfree(this);
}
ee->hws_page =
i915_error_object_create(i915,
- engine->status_page.vma);
-
- ee->wa_ctx = i915_error_object_create(i915, engine->wa_ctx.vma);
-
- ee->default_state = capture_object(i915, engine->default_state);
- }
-}
-
-static void gem_capture_vm(struct i915_gpu_state *error,
- struct i915_address_space *vm,
- int idx)
-{
- struct drm_i915_error_buffer *active_bo;
- struct i915_vma *vma;
- int count;
+ engine->status_page.vma,
+ compress);
- count = 0;
- list_for_each_entry(vma, &vm->bound_list, vm_link)
- if (i915_vma_is_active(vma))
- count++;
-
- active_bo = NULL;
- if (count)
- active_bo = kcalloc(count, sizeof(*active_bo), GFP_ATOMIC);
- if (active_bo)
- count = capture_error_bo(active_bo,
- count, &vm->bound_list,
- ACTIVE_ONLY);
- else
- count = 0;
-
- error->active_vm[idx] = vm;
- error->active_bo[idx] = active_bo;
- error->active_bo_count[idx] = count;
-}
+ ee->wa_ctx =
+ i915_error_object_create(i915,
+ engine->wa_ctx.vma,
+ compress);
-static void capture_active_buffers(struct i915_gpu_state *error)
-{
- int cnt = 0, i, j;
+ ee->default_state =
+ capture_object(i915, engine->default_state, compress);
- BUILD_BUG_ON(ARRAY_SIZE(error->engine) > ARRAY_SIZE(error->active_bo));
- BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_vm));
- BUILD_BUG_ON(ARRAY_SIZE(error->active_bo) != ARRAY_SIZE(error->active_bo_count));
+ ee->engine = engine;
- /* Scan each engine looking for unique active contexts/vm */
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- struct drm_i915_error_engine *ee = &error->engine[i];
- bool found;
+ ee->next = error->engine;
+ error->engine = ee;
- if (!ee->vm)
- continue;
-
- found = false;
- for (j = 0; j < i && !found; j++)
- found = error->engine[j].vm == ee->vm;
- if (!found)
- gem_capture_vm(error, ee->vm, cnt++);
+ ee = kzalloc(sizeof(*ee), GFP_KERNEL);
+ if (!ee)
+ return;
}
-}
-
-static void capture_pinned_buffers(struct i915_gpu_state *error)
-{
- struct i915_address_space *vm = &error->i915->ggtt.vm;
- struct drm_i915_error_buffer *bo;
- struct i915_vma *vma;
- int count;
-
- count = 0;
- list_for_each_entry(vma, &vm->bound_list, vm_link)
- count++;
-
- bo = NULL;
- if (count)
- bo = kcalloc(count, sizeof(*bo), GFP_ATOMIC);
- if (!bo)
- return;
- error->pinned_bo_count =
- capture_error_bo(bo, count, &vm->bound_list, PINNED_ONLY);
- error->pinned_bo = bo;
+ kfree(ee);
}
-static void capture_uc_state(struct i915_gpu_state *error)
+static void
+capture_uc_state(struct i915_gpu_state *error, struct compress *compress)
{
struct drm_i915_private *i915 = error->i915;
struct i915_error_uc *error_uc = &error->uc;
+ struct intel_uc *uc = &i915->gt.uc;
/* Capturing uC state won't be useful if there is no GuC */
- if (!error->device_info.has_guc)
+ if (!error->device_info.has_gt_uc)
return;
- error_uc->guc_fw = i915->guc.fw;
- error_uc->huc_fw = i915->huc.fw;
+ memcpy(&error_uc->guc_fw, &uc->guc.fw, sizeof(uc->guc.fw));
+ memcpy(&error_uc->huc_fw, &uc->huc.fw, sizeof(uc->huc.fw));
/* Non-default firmware paths will be specified by the modparam.
* As modparams are generally accesible from the userspace make
* explicit copies of the firmware paths.
*/
- error_uc->guc_fw.path = kstrdup(i915->guc.fw.path, GFP_ATOMIC);
- error_uc->huc_fw.path = kstrdup(i915->huc.fw.path, GFP_ATOMIC);
- error_uc->guc_log = i915_error_object_create(i915, i915->guc.log.vma);
+ error_uc->guc_fw.path = kstrdup(uc->guc.fw.path, ALLOW_FAIL);
+ error_uc->huc_fw.path = kstrdup(uc->huc.fw.path, ALLOW_FAIL);
+ error_uc->guc_log = i915_error_object_create(i915,
+ uc->guc.log.vma,
+ compress);
}
/* Capture all registers which don't fit into another category. */
static void capture_reg_state(struct i915_gpu_state *error)
{
- struct drm_i915_private *dev_priv = error->i915;
+ struct drm_i915_private *i915 = error->i915;
+ struct intel_uncore *uncore = &i915->uncore;
int i;
/* General organization
@@ -1638,95 +1555,125 @@ static void capture_reg_state(struct i915_gpu_state *error)
*/
/* 1: Registers specific to a single generation */
- if (IS_VALLEYVIEW(dev_priv)) {
- error->gtier[0] = I915_READ(GTIER);
- error->ier = I915_READ(VLV_IER);
- error->forcewake = I915_READ_FW(FORCEWAKE_VLV);
+ if (IS_VALLEYVIEW(i915)) {
+ error->gtier[0] = intel_uncore_read(uncore, GTIER);
+ error->ier = intel_uncore_read(uncore, VLV_IER);
+ error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_VLV);
}
- if (IS_GEN(dev_priv, 7))
- error->err_int = I915_READ(GEN7_ERR_INT);
-
- if (INTEL_GEN(dev_priv) >= 8) {
- error->fault_data0 = I915_READ(GEN8_FAULT_TLB_DATA0);
- error->fault_data1 = I915_READ(GEN8_FAULT_TLB_DATA1);
+ if (IS_GEN(i915, 7))
+ error->err_int = intel_uncore_read(uncore, GEN7_ERR_INT);
+
+ if (INTEL_GEN(i915) >= 12) {
+ error->fault_data0 = intel_uncore_read(uncore,
+ GEN12_FAULT_TLB_DATA0);
+ error->fault_data1 = intel_uncore_read(uncore,
+ GEN12_FAULT_TLB_DATA1);
+ } else if (INTEL_GEN(i915) >= 8) {
+ error->fault_data0 = intel_uncore_read(uncore,
+ GEN8_FAULT_TLB_DATA0);
+ error->fault_data1 = intel_uncore_read(uncore,
+ GEN8_FAULT_TLB_DATA1);
}
- if (IS_GEN(dev_priv, 6)) {
- error->forcewake = I915_READ_FW(FORCEWAKE);
- error->gab_ctl = I915_READ(GAB_CTL);
- error->gfx_mode = I915_READ(GFX_MODE);
+ if (IS_GEN(i915, 6)) {
+ error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE);
+ error->gab_ctl = intel_uncore_read(uncore, GAB_CTL);
+ error->gfx_mode = intel_uncore_read(uncore, GFX_MODE);
}
/* 2: Registers which belong to multiple generations */
- if (INTEL_GEN(dev_priv) >= 7)
- error->forcewake = I915_READ_FW(FORCEWAKE_MT);
-
- if (INTEL_GEN(dev_priv) >= 6) {
- error->derrmr = I915_READ(DERRMR);
- error->error = I915_READ(ERROR_GEN6);
- error->done_reg = I915_READ(DONE_REG);
+ if (INTEL_GEN(i915) >= 7)
+ error->forcewake = intel_uncore_read_fw(uncore, FORCEWAKE_MT);
+
+ if (INTEL_GEN(i915) >= 6) {
+ error->derrmr = intel_uncore_read(uncore, DERRMR);
+ if (INTEL_GEN(i915) < 12) {
+ error->error = intel_uncore_read(uncore, ERROR_GEN6);
+ error->done_reg = intel_uncore_read(uncore, DONE_REG);
+ }
}
- if (INTEL_GEN(dev_priv) >= 5)
- error->ccid = I915_READ(CCID);
+ if (INTEL_GEN(i915) >= 5)
+ error->ccid = intel_uncore_read(uncore, CCID(RENDER_RING_BASE));
/* 3: Feature specific registers */
- if (IS_GEN_RANGE(dev_priv, 6, 7)) {
- error->gam_ecochk = I915_READ(GAM_ECOCHK);
- error->gac_eco = I915_READ(GAC_ECO_BITS);
+ if (IS_GEN_RANGE(i915, 6, 7)) {
+ error->gam_ecochk = intel_uncore_read(uncore, GAM_ECOCHK);
+ error->gac_eco = intel_uncore_read(uncore, GAC_ECO_BITS);
+ }
+
+ if (IS_GEN_RANGE(i915, 8, 11))
+ error->gtt_cache = intel_uncore_read(uncore, HSW_GTT_CACHE_EN);
+
+ if (IS_GEN(i915, 12))
+ error->aux_err = intel_uncore_read(uncore, GEN12_AUX_ERR_DBG);
+
+ if (INTEL_GEN(i915) >= 12) {
+ for (i = 0; i < GEN12_SFC_DONE_MAX; i++) {
+ error->sfc_done[i] =
+ intel_uncore_read(uncore, GEN12_SFC_DONE(i));
+ }
+
+ error->gam_done = intel_uncore_read(uncore, GEN12_GAM_DONE);
}
/* 4: Everything else */
- if (INTEL_GEN(dev_priv) >= 11) {
- error->ier = I915_READ(GEN8_DE_MISC_IER);
- error->gtier[0] = I915_READ(GEN11_RENDER_COPY_INTR_ENABLE);
- error->gtier[1] = I915_READ(GEN11_VCS_VECS_INTR_ENABLE);
- error->gtier[2] = I915_READ(GEN11_GUC_SG_INTR_ENABLE);
- error->gtier[3] = I915_READ(GEN11_GPM_WGBOXPERF_INTR_ENABLE);
- error->gtier[4] = I915_READ(GEN11_CRYPTO_RSVD_INTR_ENABLE);
- error->gtier[5] = I915_READ(GEN11_GUNIT_CSME_INTR_ENABLE);
+ if (INTEL_GEN(i915) >= 11) {
+ error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
+ error->gtier[0] =
+ intel_uncore_read(uncore,
+ GEN11_RENDER_COPY_INTR_ENABLE);
+ error->gtier[1] =
+ intel_uncore_read(uncore, GEN11_VCS_VECS_INTR_ENABLE);
+ error->gtier[2] =
+ intel_uncore_read(uncore, GEN11_GUC_SG_INTR_ENABLE);
+ error->gtier[3] =
+ intel_uncore_read(uncore,
+ GEN11_GPM_WGBOXPERF_INTR_ENABLE);
+ error->gtier[4] =
+ intel_uncore_read(uncore,
+ GEN11_CRYPTO_RSVD_INTR_ENABLE);
+ error->gtier[5] =
+ intel_uncore_read(uncore,
+ GEN11_GUNIT_CSME_INTR_ENABLE);
error->ngtier = 6;
- } else if (INTEL_GEN(dev_priv) >= 8) {
- error->ier = I915_READ(GEN8_DE_MISC_IER);
+ } else if (INTEL_GEN(i915) >= 8) {
+ error->ier = intel_uncore_read(uncore, GEN8_DE_MISC_IER);
for (i = 0; i < 4; i++)
- error->gtier[i] = I915_READ(GEN8_GT_IER(i));
+ error->gtier[i] = intel_uncore_read(uncore,
+ GEN8_GT_IER(i));
error->ngtier = 4;
- } else if (HAS_PCH_SPLIT(dev_priv)) {
- error->ier = I915_READ(DEIER);
- error->gtier[0] = I915_READ(GTIER);
+ } else if (HAS_PCH_SPLIT(i915)) {
+ error->ier = intel_uncore_read(uncore, DEIER);
+ error->gtier[0] = intel_uncore_read(uncore, GTIER);
error->ngtier = 1;
- } else if (IS_GEN(dev_priv, 2)) {
- error->ier = I915_READ16(IER);
- } else if (!IS_VALLEYVIEW(dev_priv)) {
- error->ier = I915_READ(IER);
+ } else if (IS_GEN(i915, 2)) {
+ error->ier = intel_uncore_read16(uncore, GEN2_IER);
+ } else if (!IS_VALLEYVIEW(i915)) {
+ error->ier = intel_uncore_read(uncore, GEN2_IER);
}
- error->eir = I915_READ(EIR);
- error->pgtbl_er = I915_READ(PGTBL_ER);
+ error->eir = intel_uncore_read(uncore, EIR);
+ error->pgtbl_er = intel_uncore_read(uncore, PGTBL_ER);
}
static const char *
-error_msg(struct i915_gpu_state *error, unsigned long engines, const char *msg)
+error_msg(struct i915_gpu_state *error,
+ intel_engine_mask_t engines, const char *msg)
{
int len;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(error->engine); i++)
- if (!error->engine[i].context.pid)
- engines &= ~BIT(i);
len = scnprintf(error->error_msg, sizeof(error->error_msg),
- "GPU HANG: ecode %d:%lx:0x%08x",
+ "GPU HANG: ecode %d:%x:0x%08x",
INTEL_GEN(error->i915), engines,
- i915_error_generate_code(error, engines));
- if (engines) {
+ i915_error_generate_code(error));
+ if (error->engine) {
/* Just show the first executing process, more is confusing */
- i = __ffs(engines);
len += scnprintf(error->error_msg + len,
sizeof(error->error_msg) - len,
", in %s [%d]",
- error->engine[i].context.comm,
- error->engine[i].context.pid);
+ error->engine->context.comm,
+ error->engine->context.pid);
}
if (msg)
len += scnprintf(error->error_msg + len,
@@ -1765,56 +1712,15 @@ static void capture_params(struct i915_gpu_state *error)
i915_params_copy(&error->params, &i915_modparams);
}
-static unsigned long capture_find_epoch(const struct i915_gpu_state *error)
-{
- unsigned long epoch = error->capture;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(error->engine); i++) {
- const struct drm_i915_error_engine *ee = &error->engine[i];
-
- if (ee->hangcheck_timestamp &&
- time_before(ee->hangcheck_timestamp, epoch))
- epoch = ee->hangcheck_timestamp;
- }
-
- return epoch;
-}
-
static void capture_finish(struct i915_gpu_state *error)
{
struct i915_ggtt *ggtt = &error->i915->ggtt;
- const u64 slot = ggtt->error_capture.start;
-
- ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
-}
-
-static int capture(void *data)
-{
- struct i915_gpu_state *error = data;
-
- error->time = ktime_get_real();
- error->boottime = ktime_get_boottime();
- error->uptime = ktime_sub(ktime_get(),
- error->i915->gt.last_init_time);
- error->capture = jiffies;
- capture_params(error);
- capture_gen_state(error);
- capture_uc_state(error);
- capture_reg_state(error);
- gem_record_fences(error);
- gem_record_rings(error);
- capture_active_buffers(error);
- capture_pinned_buffers(error);
-
- error->overlay = intel_overlay_capture_error_state(error->i915);
- error->display = intel_display_capture_error_state(error->i915);
+ if (drm_mm_node_allocated(&ggtt->error_capture)) {
+ const u64 slot = ggtt->error_capture.start;
- error->epoch = capture_find_epoch(error);
-
- capture_finish(error);
- return 0;
+ ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE);
+ }
}
#define DAY_AS_SECONDS(x) (24 * 60 * 60 * (x))
@@ -1823,22 +1729,45 @@ struct i915_gpu_state *
i915_capture_gpu_state(struct drm_i915_private *i915)
{
struct i915_gpu_state *error;
+ struct compress compress;
/* Check if GPU capture has been disabled */
error = READ_ONCE(i915->gpu_error.first_error);
if (IS_ERR(error))
return error;
- error = kzalloc(sizeof(*error), GFP_ATOMIC);
+ error = kzalloc(sizeof(*error), ALLOW_FAIL);
if (!error) {
i915_disable_error_state(i915, -ENOMEM);
return ERR_PTR(-ENOMEM);
}
+ if (!compress_init(&compress)) {
+ kfree(error);
+ i915_disable_error_state(i915, -ENOMEM);
+ return ERR_PTR(-ENOMEM);
+ }
+
kref_init(&error->ref);
error->i915 = i915;
- stop_machine(capture, error, NULL);
+ error->time = ktime_get_real();
+ error->boottime = ktime_get_boottime();
+ error->uptime = ktime_sub(ktime_get(), i915->gt.last_init_time);
+ error->capture = jiffies;
+
+ capture_params(error);
+ capture_gen_state(error);
+ capture_uc_state(error, &compress);
+ capture_reg_state(error);
+ gem_record_fences(error);
+ gem_record_rings(error, &compress);
+
+ error->overlay = intel_overlay_capture_error_state(i915);
+ error->display = intel_display_capture_error_state(i915);
+
+ capture_finish(error);
+ compress_fini(&compress);
return error;
}
@@ -1855,7 +1784,7 @@ i915_capture_gpu_state(struct drm_i915_private *i915)
* to pick up.
*/
void i915_capture_error_state(struct drm_i915_private *i915,
- unsigned long engine_mask,
+ intel_engine_mask_t engine_mask,
const char *msg)
{
static bool warned;
@@ -1888,15 +1817,14 @@ void i915_capture_error_state(struct drm_i915_private *i915,
return;
}
- if (!warned &&
+ if (!xchg(&warned, true) &&
ktime_get_real_seconds() - DRIVER_TIMESTAMP < DAY_AS_SECONDS(180)) {
- DRM_INFO("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
- DRM_INFO("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
- DRM_INFO("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
- DRM_INFO("The gpu crash dump is required to analyze gpu hangs, so please always attach it.\n");
- DRM_INFO("GPU crash dump saved to /sys/class/drm/card%d/error\n",
- i915->drm.primary->index);
- warned = true;
+ pr_info("GPU hangs can indicate a bug anywhere in the entire gfx stack, including userspace.\n");
+ pr_info("Please file a _new_ bug report on bugs.freedesktop.org against DRI -> DRM/Intel\n");
+ pr_info("drm/i915 developers can then reassign to the right component if it's not a kernel issue.\n");
+ pr_info("The GPU crash dump is required to analyze GPU hangs, so please always attach it.\n");
+ pr_info("GPU crash dump saved to /sys/class/drm/card%d/error\n",
+ i915->drm.primary->index);
}
}