author    Chris Wilson <chris@chris-wilson.co.uk>	2017-06-16 15:05:16 +0100
committer Chris Wilson <chris@chris-wilson.co.uk>	2017-06-16 16:54:04 +0100
commit    4ff4b44cbb70c269259958cbcc48d7b8a2cb9ec8
tree      19b5eda36527b1f71caa77ada46083263fc01e69
parent    drm/i915: Fix retrieval of hangcheck stats
drm/i915: Store a direct lookup from object handle to vma
The advent of full-ppgtt led to an extra indirection between the object
and its binding. That extra indirection has a noticeable impact on how
fast we can convert from the user handles to our internal vma for
execbuffer. In order to bypass the extra indirection, we use a resizable
hashtable to jump from the object to the per-ctx vma. rhashtable was
considered but we don't need the online resizing feature and the extra
complexity proved to undermine its usefulness. Instead, we simply
reallocate the hashtable on demand in a background task and serialize it
before iterating.

In non-full-ppgtt modes, multiple files and multiple contexts can share
the same vma. This leads to having multiple possible handle->vma links,
so we only use the first to establish the fast path. The majority of
buffers are not shared and so we should still be able to realise
speedups with multiple clients.

v2: Prettier names, more magic.
v3: Many style tweaks, most notably hiding the misuse of
execobj[].rsvd2

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
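As a rough illustration of the lookup scheme the message describes - a
power-of-two bucket array indexed by a 32-bit multiplicative hash of the
object handle, with chained entries per bucket - here is a minimal,
self-contained userspace sketch. It is not the i915 code: hash32(),
struct lut and struct vma below are illustrative stand-ins for the
kernel's hash_32(), eb->buckets and hlist machinery seen in the diff.

	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	struct vma {
		uint32_t handle;
		struct vma *next;	/* hlist-style chaining within a bucket */
	};

	/* Multiplicative hash, same idea (and constant) as the kernel's hash_32(). */
	static unsigned int hash32(uint32_t val, unsigned int bits)
	{
		return (val * 0x61C88647u) >> (32 - bits);
	}

	struct lut {
		struct vma **buckets;
		unsigned int bits;	/* table holds 1 << bits buckets */
	};

	static int lut_create(struct lut *lut, unsigned int nr_objects)
	{
		unsigned int bits = 1;

		while ((1u << bits) < 2 * nr_objects)	/* keep load factor below 1/2 */
			bits++;
		lut->buckets = calloc(1u << bits, sizeof(*lut->buckets));
		if (!lut->buckets)
			return -1;
		lut->bits = bits;
		return 0;
	}

	static void lut_add(struct lut *lut, struct vma *vma)
	{
		struct vma **head = &lut->buckets[hash32(vma->handle, lut->bits)];

		vma->next = *head;	/* push onto the bucket's chain */
		*head = vma;
	}

	static struct vma *lut_lookup(const struct lut *lut, uint32_t handle)
	{
		struct vma *vma = lut->buckets[hash32(handle, lut->bits)];

		while (vma && vma->handle != handle)
			vma = vma->next;
		return vma;
	}

	int main(void)
	{
		struct lut lut;
		struct vma a = { .handle = 42 }, b = { .handle = 7 };

		if (lut_create(&lut, 2))
			return 1;
		lut_add(&lut, &a);
		lut_add(&lut, &b);
		printf("lookup(42) -> handle %u\n", lut_lookup(&lut, 42)->handle);
		printf("lookup(9)  -> %s\n", lut_lookup(&lut, 9) ? "hit" : "miss");
		free(lut.buckets);
		return 0;
	}

The patch's eb_create() sizes the table the same way (one bit above
ilog2 of the buffer count) and falls back to a single bucket under
memory pressure rather than failing outright.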
Diffstat (limited to 'drivers/gpu/drm/i915/i915_gem_execbuffer.c')
-rw-r--r--  drivers/gpu/drm/i915/i915_gem_execbuffer.c | 264
1 file changed, 160 insertions(+), 104 deletions(-)
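The v3 note above mentions hiding the misuse of execobj[].rsvd2: the
patch stashes a kernel pointer in that reserved u64 field of the
userspace-visible execobject, and during the slow path tags the low bit
(INTERMEDIATE) to mark a not-yet-resolved object pointer, which is safe
because allocations are at least pointer-aligned. Below is a hedged,
self-contained sketch of that idiom; the struct and helper names are
illustrative stand-ins, not the uapi or i915 definitions.

	#include <assert.h>
	#include <stdint.h>

	struct execobject { uint64_t rsvd2; };	/* stand-in for the uapi struct */
	struct vma { int dummy; };

	#define INTERMEDIATE 0x1ull	/* low tag bit, free due to alignment */

	static void stash(struct execobject *ee, struct vma *vma)
	{
		ee->rsvd2 = (uintptr_t)vma;	/* pointer hidden in a reserved u64 */
	}

	static struct vma *unstash(const struct execobject *ee)
	{
		/* Mask off any tag bit before converting back to a pointer. */
		return (struct vma *)(uintptr_t)(ee->rsvd2 & ~INTERMEDIATE);
	}

	int main(void)
	{
		struct vma v;
		struct execobject ee;

		stash(&ee, &v);
		assert(unstash(&ee) == &v);

		ee.rsvd2 |= INTERMEDIATE;	/* tag during the slow pass */
		assert(unstash(&ee) == &v);	/* unstash masks the tag off */
		return 0;
	}

In the diff this corresponds to the __exec_to_vma()/exec_to_vma()
macros and the INTERMEDIATE bit used while resolving handles under
eb->file->table_lock.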
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index d6099d084748..b06f561a268f 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -75,37 +75,42 @@ struct i915_execbuffer {
unsigned int page;
bool use_64bit_reloc : 1;
} reloc_cache;
- int and;
- union {
- struct i915_vma **lut;
- struct hlist_head *buckets;
- };
+ int lut_mask;
+ struct hlist_head *buckets;
};
+/*
+ * As an alternative to creating a hashtable of handle-to-vma for a batch,
+ * we used the last available reserved field in the execobject[] and stash
+ * a link from the execobj to its vma.
+ */
+#define __exec_to_vma(ee) (ee)->rsvd2
+#define exec_to_vma(ee) u64_to_ptr(struct i915_vma, __exec_to_vma(ee))
+
static int eb_create(struct i915_execbuffer *eb)
{
- eb->lut = NULL;
- if (eb->args->flags & I915_EXEC_HANDLE_LUT) {
- unsigned int size = eb->args->buffer_count;
- size *= sizeof(struct i915_vma *);
- eb->lut = kmalloc(size,
- GFP_TEMPORARY | __GFP_NOWARN | __GFP_NORETRY);
- }
-
- if (!eb->lut) {
- unsigned int size = eb->args->buffer_count;
- unsigned int count = PAGE_SIZE / sizeof(struct hlist_head) / 2;
- BUILD_BUG_ON_NOT_POWER_OF_2(PAGE_SIZE / sizeof(struct hlist_head));
- while (count > 2*size)
- count >>= 1;
- eb->lut = kzalloc(count * sizeof(struct hlist_head),
- GFP_TEMPORARY);
- if (!eb->lut)
- return -ENOMEM;
-
- eb->and = count - 1;
+ if ((eb->args->flags & I915_EXEC_HANDLE_LUT) == 0) {
+ unsigned int size = 1 + ilog2(eb->args->buffer_count);
+
+ do {
+ eb->buckets = kzalloc(sizeof(struct hlist_head) << size,
+ GFP_TEMPORARY |
+ __GFP_NORETRY |
+ __GFP_NOWARN);
+ if (eb->buckets)
+ break;
+ } while (--size);
+
+ if (unlikely(!eb->buckets)) {
+ eb->buckets = kzalloc(sizeof(struct hlist_head),
+ GFP_TEMPORARY);
+ if (unlikely(!eb->buckets))
+ return -ENOMEM;
+ }
+
+ eb->lut_mask = size;
} else {
- eb->and = -eb->args->buffer_count;
+ eb->lut_mask = -eb->args->buffer_count;
}
return 0;
@@ -142,73 +147,112 @@ eb_reset(struct i915_execbuffer *eb)
vma->exec_entry = NULL;
}
- if (eb->and >= 0)
- memset(eb->buckets, 0, (eb->and+1)*sizeof(struct hlist_head));
+ if (eb->lut_mask >= 0)
+ memset(eb->buckets, 0,
+ sizeof(struct hlist_head) << eb->lut_mask);
}
-static struct i915_vma *
-eb_get_batch(struct i915_execbuffer *eb)
+static bool
+eb_add_vma(struct i915_execbuffer *eb, struct i915_vma *vma, int i)
{
- struct i915_vma *vma = list_entry(eb->vmas.prev, typeof(*vma), exec_link);
+ if (unlikely(vma->exec_entry)) {
+ DRM_DEBUG("Object [handle %d, index %d] appears more than once in object list\n",
+ eb->exec[i].handle, i);
+ return false;
+ }
+ list_add_tail(&vma->exec_link, &eb->vmas);
- /*
- * SNA is doing fancy tricks with compressing batch buffers, which leads
- * to negative relocation deltas. Usually that works out ok since the
- * relocate address is still positive, except when the batch is placed
- * very low in the GTT. Ensure this doesn't happen.
- *
- * Note that actual hangs have only been observed on gen7, but for
- * paranoia do it everywhere.
- */
- if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
- vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
+ vma->exec_entry = &eb->exec[i];
+ if (eb->lut_mask >= 0) {
+ vma->exec_handle = eb->exec[i].handle;
+ hlist_add_head(&vma->exec_node,
+ &eb->buckets[hash_32(vma->exec_handle,
+ eb->lut_mask)]);
+ }
- return vma;
+ i915_vma_get(vma);
+ __exec_to_vma(&eb->exec[i]) = (uintptr_t)vma;
+ return true;
+}
+
+static inline struct hlist_head *
+ht_head(const struct i915_gem_context *ctx, u32 handle)
+{
+ return &ctx->vma_lut.ht[hash_32(handle, ctx->vma_lut.ht_bits)];
+}
+
+static inline bool
+ht_needs_resize(const struct i915_gem_context *ctx)
+{
+ return (4*ctx->vma_lut.ht_count > 3*ctx->vma_lut.ht_size ||
+ 4*ctx->vma_lut.ht_count + 1 < ctx->vma_lut.ht_size);
}
static int
eb_lookup_vmas(struct i915_execbuffer *eb)
{
- struct drm_i915_gem_object *obj;
- struct list_head objects;
- int i, ret;
+#define INTERMEDIATE BIT(0)
+ const int count = eb->args->buffer_count;
+ struct i915_vma *vma;
+ int slow_pass = -1;
+ int i;
INIT_LIST_HEAD(&eb->vmas);
- INIT_LIST_HEAD(&objects);
+ if (unlikely(eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS))
+ flush_work(&eb->ctx->vma_lut.resize);
+ GEM_BUG_ON(eb->ctx->vma_lut.ht_size & I915_CTX_RESIZE_IN_PROGRESS);
+
+ for (i = 0; i < count; i++) {
+ __exec_to_vma(&eb->exec[i]) = 0;
+
+ hlist_for_each_entry(vma,
+ ht_head(eb->ctx, eb->exec[i].handle),
+ ctx_node) {
+ if (vma->ctx_handle != eb->exec[i].handle)
+ continue;
+
+ if (!eb_add_vma(eb, vma, i))
+ return -EINVAL;
+
+ goto next_vma;
+ }
+
+ if (slow_pass < 0)
+ slow_pass = i;
+next_vma: ;
+ }
+
+ if (slow_pass < 0)
+ return 0;
+
spin_lock(&eb->file->table_lock);
/* Grab a reference to the object and release the lock so we can lookup
* or create the VMA without using GFP_ATOMIC */
- for (i = 0; i < eb->args->buffer_count; i++) {
- obj = to_intel_bo(idr_find(&eb->file->object_idr, eb->exec[i].handle));
- if (obj == NULL) {
- spin_unlock(&eb->file->table_lock);
- DRM_DEBUG("Invalid object handle %d at index %d\n",
- eb->exec[i].handle, i);
- ret = -ENOENT;
- goto err;
- }
+ for (i = slow_pass; i < count; i++) {
+ struct drm_i915_gem_object *obj;
- if (!list_empty(&obj->obj_exec_link)) {
+ if (__exec_to_vma(&eb->exec[i]))
+ continue;
+
+ obj = to_intel_bo(idr_find(&eb->file->object_idr,
+ eb->exec[i].handle));
+ if (unlikely(!obj)) {
spin_unlock(&eb->file->table_lock);
- DRM_DEBUG("Object %p [handle %d, index %d] appears more than once in object list\n",
- obj, eb->exec[i].handle, i);
- ret = -EINVAL;
- goto err;
+ DRM_DEBUG("Invalid object handle %d at index %d\n",
+ eb->exec[i].handle, i);
+ return -ENOENT;
}
- i915_gem_object_get(obj);
- list_add_tail(&obj->obj_exec_link, &objects);
+ __exec_to_vma(&eb->exec[i]) = INTERMEDIATE | (uintptr_t)obj;
}
spin_unlock(&eb->file->table_lock);
- i = 0;
- while (!list_empty(&objects)) {
- struct i915_vma *vma;
+ for (i = slow_pass; i < count; i++) {
+ struct drm_i915_gem_object *obj;
- obj = list_first_entry(&objects,
- struct drm_i915_gem_object,
- obj_exec_link);
+ if ((__exec_to_vma(&eb->exec[i]) & INTERMEDIATE) == 0)
+ continue;
/*
* NOTE: We can leak any vmas created here when something fails
@@ -218,61 +262,73 @@ eb_lookup_vmas(struct i915_execbuffer *eb)
* from the (obj, vm) we don't run the risk of creating
* duplicated vmas for the same vm.
*/
+ obj = u64_to_ptr(struct drm_i915_gem_object,
+ __exec_to_vma(&eb->exec[i]) & ~INTERMEDIATE);
vma = i915_vma_instance(obj, eb->vm, NULL);
if (unlikely(IS_ERR(vma))) {
DRM_DEBUG("Failed to lookup VMA\n");
- ret = PTR_ERR(vma);
- goto err;
+ return PTR_ERR(vma);
}
- /* Transfer ownership from the objects list to the vmas list. */
- list_add_tail(&vma->exec_link, &eb->vmas);
- list_del_init(&obj->obj_exec_link);
-
- vma->exec_entry = &eb->exec[i];
- if (eb->and < 0) {
- eb->lut[i] = vma;
- } else {
- u32 handle =
- eb->args->flags & I915_EXEC_HANDLE_LUT ?
- i : eb->exec[i].handle;
- vma->exec_handle = handle;
- hlist_add_head(&vma->exec_node,
- &eb->buckets[handle & eb->and]);
+ /* First come, first served */
+ if (!vma->ctx) {
+ vma->ctx = eb->ctx;
+ vma->ctx_handle = eb->exec[i].handle;
+ hlist_add_head(&vma->ctx_node,
+ ht_head(eb->ctx, eb->exec[i].handle));
+ eb->ctx->vma_lut.ht_count++;
+ if (i915_vma_is_ggtt(vma)) {
+ GEM_BUG_ON(obj->vma_hashed);
+ obj->vma_hashed = vma;
+ }
}
- ++i;
+
+ if (!eb_add_vma(eb, vma, i))
+ return -EINVAL;
+ }
+
+ if (ht_needs_resize(eb->ctx)) {
+ eb->ctx->vma_lut.ht_size |= I915_CTX_RESIZE_IN_PROGRESS;
+ queue_work(system_highpri_wq, &eb->ctx->vma_lut.resize);
}
return 0;
+#undef INTERMEDIATE
+}
+static struct i915_vma *
+eb_get_batch(struct i915_execbuffer *eb)
+{
+ struct i915_vma *vma =
+ exec_to_vma(&eb->exec[eb->args->buffer_count - 1]);
-err:
- while (!list_empty(&objects)) {
- obj = list_first_entry(&objects,
- struct drm_i915_gem_object,
- obj_exec_link);
- list_del_init(&obj->obj_exec_link);
- i915_gem_object_put(obj);
- }
/*
- * Objects already transfered to the vmas list will be unreferenced by
- * eb_destroy.
+ * SNA is doing fancy tricks with compressing batch buffers, which leads
+ * to negative relocation deltas. Usually that works out ok since the
+ * relocate address is still positive, except when the batch is placed
+ * very low in the GTT. Ensure this doesn't happen.
+ *
+ * Note that actual hangs have only been observed on gen7, but for
+ * paranoia do it everywhere.
*/
+ if ((vma->exec_entry->flags & EXEC_OBJECT_PINNED) == 0)
+ vma->exec_entry->flags |= __EXEC_OBJECT_NEEDS_BIAS;
- return ret;
+ return vma;
}
-static struct i915_vma *eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
+static struct i915_vma *
+eb_get_vma(struct i915_execbuffer *eb, unsigned long handle)
{
- if (eb->and < 0) {
- if (handle >= -eb->and)
+ if (eb->lut_mask < 0) {
+ if (handle >= -eb->lut_mask)
return NULL;
- return eb->lut[handle];
+ return exec_to_vma(&eb->exec[handle]);
} else {
struct hlist_head *head;
struct i915_vma *vma;
- head = &eb->buckets[handle & eb->and];
+ head = &eb->buckets[hash_32(handle, eb->lut_mask)];
hlist_for_each_entry(vma, head, exec_node) {
if (vma->exec_handle == handle)
return vma;
@@ -296,7 +352,7 @@ static void eb_destroy(struct i915_execbuffer *eb)
i915_gem_context_put(eb->ctx);
- if (eb->buckets)
+ if (eb->lut_mask >= 0)
kfree(eb->buckets);
}
@@ -916,7 +972,7 @@ static int eb_reserve(struct i915_execbuffer *eb)
need_fence =
(entry->flags & EXEC_OBJECT_NEEDS_FENCE ||
needs_unfenced_map) &&
- i915_gem_object_is_tiled(obj);
+ i915_gem_object_is_tiled(vma->obj);
need_mappable = need_fence || need_reloc_mappable(vma);
if (entry->flags & EXEC_OBJECT_PINNED)