aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/gpu/drm/i915/i915_request.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/i915/i915_request.c')
-rw-r--r--drivers/gpu/drm/i915/i915_request.c300
1 files changed, 152 insertions, 148 deletions
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index a53777dd371c..bbd71af00a91 100644
--- a/drivers/gpu/drm/i915/i915_request.c
+++ b/drivers/gpu/drm/i915/i915_request.c
@@ -31,6 +31,8 @@
#include "gem/i915_gem_context.h"
#include "gt/intel_context.h"
+#include "gt/intel_ring.h"
+#include "gt/intel_rps.h"
#include "i915_active.h"
#include "i915_drv.h"
@@ -169,16 +171,17 @@ remove_from_client(struct i915_request *request)
{
struct drm_i915_file_private *file_priv;
- file_priv = READ_ONCE(request->file_priv);
- if (!file_priv)
+ if (!READ_ONCE(request->file_priv))
return;
- spin_lock(&file_priv->mm.lock);
- if (request->file_priv) {
+ rcu_read_lock();
+ file_priv = xchg(&request->file_priv, NULL);
+ if (file_priv) {
+ spin_lock(&file_priv->mm.lock);
list_del(&request->client_link);
- request->file_priv = NULL;
+ spin_unlock(&file_priv->mm.lock);
}
- spin_unlock(&file_priv->mm.lock);
+ rcu_read_unlock();
}
static void free_capture_list(struct i915_request *request)
@@ -194,11 +197,29 @@ static void free_capture_list(struct i915_request *request)
}
}
-static bool i915_request_retire(struct i915_request *rq)
+static void remove_from_engine(struct i915_request *rq)
{
- struct i915_active_request *active, *next;
+ struct intel_engine_cs *engine, *locked;
- lockdep_assert_held(&rq->timeline->mutex);
+ /*
+ * Virtual engines complicate acquiring the engine timeline lock,
+ * as their rq->engine pointer is not stable until under that
+ * engine lock. The simple ploy we use is to take the lock then
+ * check that the rq still belongs to the newly locked engine.
+ */
+ locked = READ_ONCE(rq->engine);
+ spin_lock_irq(&locked->active.lock);
+ while (unlikely(locked != (engine = READ_ONCE(rq->engine)))) {
+ spin_unlock(&locked->active.lock);
+ spin_lock(&engine->active.lock);
+ locked = engine;
+ }
+ list_del(&rq->sched.link);
+ spin_unlock_irq(&locked->active.lock);
+}
+
+bool i915_request_retire(struct i915_request *rq)
+{
if (!i915_request_completed(rq))
return false;
@@ -219,68 +240,34 @@ static bool i915_request_retire(struct i915_request *rq)
* Note this requires that we are always called in request
* completion order.
*/
- GEM_BUG_ON(!list_is_first(&rq->link, &rq->timeline->requests));
+ GEM_BUG_ON(!list_is_first(&rq->link,
+ &i915_request_timeline(rq)->requests));
rq->ring->head = rq->postfix;
/*
- * Walk through the active list, calling retire on each. This allows
- * objects to track their GPU activity and mark themselves as idle
- * when their *last* active request is completed (updating state
- * tracking lists for eviction, active references for GEM, etc).
- *
- * As the ->retire() may free the node, we decouple it first and
- * pass along the auxiliary information (to avoid dereferencing
- * the node after the callback).
- */
- list_for_each_entry_safe(active, next, &rq->active_list, link) {
- /*
- * In microbenchmarks or focusing upon time inside the kernel,
- * we may spend an inordinate amount of time simply handling
- * the retirement of requests and processing their callbacks.
- * Of which, this loop itself is particularly hot due to the
- * cache misses when jumping around the list of
- * i915_active_request. So we try to keep this loop as
- * streamlined as possible and also prefetch the next
- * i915_active_request to try and hide the likely cache miss.
- */
- prefetchw(next);
-
- INIT_LIST_HEAD(&active->link);
- RCU_INIT_POINTER(active->request, NULL);
-
- active->retire(active, rq);
- }
-
- local_irq_disable();
-
- /*
* We only loosely track inflight requests across preemption,
* and so we may find ourselves attempting to retire a _completed_
* request that we have removed from the HW and put back on a run
* queue.
*/
- spin_lock(&rq->engine->active.lock);
- list_del(&rq->sched.link);
- spin_unlock(&rq->engine->active.lock);
+ remove_from_engine(rq);
- spin_lock(&rq->lock);
+ spin_lock_irq(&rq->lock);
i915_request_mark_complete(rq);
if (!i915_request_signaled(rq))
dma_fence_signal_locked(&rq->fence);
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &rq->fence.flags))
i915_request_cancel_breadcrumb(rq);
if (i915_request_has_waitboost(rq)) {
- GEM_BUG_ON(!atomic_read(&rq->i915->gt_pm.rps.num_waiters));
- atomic_dec(&rq->i915->gt_pm.rps.num_waiters);
+ GEM_BUG_ON(!atomic_read(&rq->engine->gt->rps.num_waiters));
+ atomic_dec(&rq->engine->gt->rps.num_waiters);
}
if (!test_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags)) {
set_bit(I915_FENCE_FLAG_ACTIVE, &rq->fence.flags);
__notify_execute_cb(rq);
}
GEM_BUG_ON(!list_empty(&rq->execute_cb));
- spin_unlock(&rq->lock);
-
- local_irq_enable();
+ spin_unlock_irq(&rq->lock);
remove_from_client(rq);
list_del(&rq->link);
@@ -297,7 +284,7 @@ static bool i915_request_retire(struct i915_request *rq)
void i915_request_retire_upto(struct i915_request *rq)
{
- struct intel_timeline * const tl = rq->timeline;
+ struct intel_timeline * const tl = i915_request_timeline(rq);
struct i915_request *tmp;
GEM_TRACE("%s fence %llx:%lld, current %d\n",
@@ -305,7 +292,6 @@ void i915_request_retire_upto(struct i915_request *rq)
rq->fence.context, rq->fence.seqno,
hwsp_seqno(rq));
- lockdep_assert_held(&tl->mutex);
GEM_BUG_ON(!i915_request_completed(rq));
do {
@@ -358,9 +344,10 @@ __i915_request_await_execution(struct i915_request *rq,
return 0;
}
-void __i915_request_submit(struct i915_request *request)
+bool __i915_request_submit(struct i915_request *request)
{
struct intel_engine_cs *engine = request->engine;
+ bool result = false;
GEM_TRACE("%s fence %llx:%lld, current %d\n",
engine->name,
@@ -370,6 +357,25 @@ void __i915_request_submit(struct i915_request *request)
GEM_BUG_ON(!irqs_disabled());
lockdep_assert_held(&engine->active.lock);
+ /*
+ * With the advent of preempt-to-busy, we frequently encounter
+ * requests that we have unsubmitted from HW, but left running
+ * until the next ack and so have completed in the meantime. On
+ * resubmission of that completed request, we can skip
+ * updating the payload, and execlists can even skip submitting
+ * the request.
+ *
+ * We must remove the request from the caller's priority queue,
+ * and the caller must only call us when the request is in their
+ * priority queue, under the active.lock. This ensures that the
+ * request has *not* yet been retired and we can safely move
+ * the request into the engine->active.list where it will be
+ * dropped upon retiring. (Otherwise if resubmit a *retired*
+ * request, this would be a horrible use-after-free.)
+ */
+ if (i915_request_completed(request))
+ goto xfer;
+
if (i915_gem_context_is_banned(request->gem_context))
i915_request_skip(request, -EIO);
@@ -393,13 +399,18 @@ void __i915_request_submit(struct i915_request *request)
i915_sw_fence_signaled(&request->semaphore))
engine->saturated |= request->sched.semaphores;
- /* We may be recursing from the signal callback of another i915 fence */
- spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+ engine->emit_fini_breadcrumb(request,
+ request->ring->vaddr + request->postfix);
- list_move_tail(&request->sched.link, &engine->active.requests);
+ trace_i915_request_execute(request);
+ engine->serial++;
+ result = true;
- GEM_BUG_ON(test_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags));
- set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags);
+xfer: /* We may be recursing from the signal callback of another i915 fence */
+ spin_lock_nested(&request->lock, SINGLE_DEPTH_NESTING);
+
+ if (!test_and_set_bit(I915_FENCE_FLAG_ACTIVE, &request->fence.flags))
+ list_move_tail(&request->sched.link, &engine->active.requests);
if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &request->fence.flags) &&
!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &request->fence.flags) &&
@@ -410,12 +421,7 @@ void __i915_request_submit(struct i915_request *request)
spin_unlock(&request->lock);
- engine->emit_fini_breadcrumb(request,
- request->ring->vaddr + request->postfix);
-
- engine->serial++;
-
- trace_i915_request_execute(request);
+ return result;
}
void i915_request_submit(struct i915_request *request)
@@ -641,9 +647,12 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
rq->gem_context = ce->gem_context;
rq->engine = ce->engine;
rq->ring = ce->ring;
- rq->timeline = tl;
+ rq->execution_mask = ce->engine->mask;
+
+ rcu_assign_pointer(rq->timeline, tl);
rq->hwsp_seqno = tl->hwsp_seqno;
rq->hwsp_cacheline = tl->hwsp_cacheline;
+
rq->rcustate = get_state_synchronize_rcu(); /* acts as smp_mb() */
spin_lock_init(&rq->lock);
@@ -661,9 +670,7 @@ __i915_request_create(struct intel_context *ce, gfp_t gfp)
rq->batch = NULL;
rq->capture_list = NULL;
rq->flags = 0;
- rq->execution_mask = ALL_ENGINES;
- INIT_LIST_HEAD(&rq->active_list);
INIT_LIST_HEAD(&rq->execute_cb);
/*
@@ -702,7 +709,6 @@ err_unwind:
ce->ring->emit = rq->head;
/* Make sure we didn't add ourselves to external state before freeing */
- GEM_BUG_ON(!list_empty(&rq->active_list));
GEM_BUG_ON(!list_empty(&rq->sched.signalers_list));
GEM_BUG_ON(!list_empty(&rq->sched.waiters_list));
@@ -747,16 +753,43 @@ err_unlock:
static int
i915_request_await_start(struct i915_request *rq, struct i915_request *signal)
{
- if (list_is_first(&signal->link, &signal->timeline->requests))
- return 0;
+ struct intel_timeline *tl;
+ struct dma_fence *fence;
+ int err;
+
+ GEM_BUG_ON(i915_request_timeline(rq) ==
+ rcu_access_pointer(signal->timeline));
- signal = list_prev_entry(signal, link);
- if (intel_timeline_sync_is_later(rq->timeline, &signal->fence))
+ rcu_read_lock();
+ tl = rcu_dereference(signal->timeline);
+ if (i915_request_started(signal) || !kref_get_unless_zero(&tl->kref))
+ tl = NULL;
+ rcu_read_unlock();
+ if (!tl) /* already started or maybe even completed */
return 0;
- return i915_sw_fence_await_dma_fence(&rq->submit,
- &signal->fence, 0,
- I915_FENCE_GFP);
+ fence = ERR_PTR(-EBUSY);
+ if (mutex_trylock(&tl->mutex)) {
+ fence = NULL;
+ if (!i915_request_started(signal) &&
+ !list_is_first(&signal->link, &tl->requests)) {
+ signal = list_prev_entry(signal, link);
+ fence = dma_fence_get(&signal->fence);
+ }
+ mutex_unlock(&tl->mutex);
+ }
+ intel_timeline_put(tl);
+ if (IS_ERR_OR_NULL(fence))
+ return PTR_ERR_OR_ZERO(fence);
+
+ err = 0;
+ if (intel_timeline_sync_is_later(i915_request_timeline(rq), fence))
+ err = i915_sw_fence_await_dma_fence(&rq->submit,
+ fence, 0,
+ I915_FENCE_GFP);
+ dma_fence_put(fence);
+
+ return err;
}
static intel_engine_mask_t
@@ -782,34 +815,33 @@ emit_semaphore_wait(struct i915_request *to,
struct i915_request *from,
gfp_t gfp)
{
+ const int has_token = INTEL_GEN(to->i915) >= 12;
u32 hwsp_offset;
+ int len;
u32 *cs;
- int err;
- GEM_BUG_ON(!from->timeline->has_initial_breadcrumb);
GEM_BUG_ON(INTEL_GEN(to->i915) < 8);
/* Just emit the first semaphore we see as request space is limited. */
if (already_busywaiting(to) & from->engine->mask)
- return i915_sw_fence_await_dma_fence(&to->submit,
- &from->fence, 0,
- I915_FENCE_GFP);
+ goto await_fence;
- err = i915_request_await_start(to, from);
- if (err < 0)
- return err;
+ if (i915_request_await_start(to, from) < 0)
+ goto await_fence;
/* Only submit our spinner after the signaler is running! */
- err = __i915_request_await_execution(to, from, NULL, gfp);
- if (err)
- return err;
+ if (__i915_request_await_execution(to, from, NULL, gfp))
+ goto await_fence;
/* We need to pin the signaler's HWSP until we are finished reading. */
- err = intel_timeline_read_hwsp(from, to, &hwsp_offset);
- if (err)
- return err;
+ if (intel_timeline_read_hwsp(from, to, &hwsp_offset))
+ goto await_fence;
- cs = intel_ring_begin(to, 4);
+ len = 4;
+ if (has_token)
+ len += 2;
+
+ cs = intel_ring_begin(to, len);
if (IS_ERR(cs))
return PTR_ERR(cs);
@@ -821,18 +853,28 @@ emit_semaphore_wait(struct i915_request *to,
* (post-wrap) values than they were expecting (and so wait
* forever).
*/
- *cs++ = MI_SEMAPHORE_WAIT |
- MI_SEMAPHORE_GLOBAL_GTT |
- MI_SEMAPHORE_POLL |
- MI_SEMAPHORE_SAD_GTE_SDD;
+ *cs++ = (MI_SEMAPHORE_WAIT |
+ MI_SEMAPHORE_GLOBAL_GTT |
+ MI_SEMAPHORE_POLL |
+ MI_SEMAPHORE_SAD_GTE_SDD) +
+ has_token;
*cs++ = from->fence.seqno;
*cs++ = hwsp_offset;
*cs++ = 0;
+ if (has_token) {
+ *cs++ = 0;
+ *cs++ = MI_NOOP;
+ }
intel_ring_advance(to, cs);
to->sched.semaphores |= from->engine->mask;
to->sched.flags |= I915_SCHED_HAS_SEMAPHORE_CHAIN;
return 0;
+
+await_fence:
+ return i915_sw_fence_await_dma_fence(&to->submit,
+ &from->fence, 0,
+ I915_FENCE_GFP);
}
static int
@@ -916,21 +958,23 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
/* Squash repeated waits to the same timelines */
if (fence->context &&
- intel_timeline_sync_is_later(rq->timeline, fence))
+ intel_timeline_sync_is_later(i915_request_timeline(rq),
+ fence))
continue;
if (dma_fence_is_i915(fence))
ret = i915_request_await_request(rq, to_request(fence));
else
ret = i915_sw_fence_await_dma_fence(&rq->submit, fence,
- I915_FENCE_TIMEOUT,
+ fence->context ? I915_FENCE_TIMEOUT : 0,
I915_FENCE_GFP);
if (ret < 0)
return ret;
/* Record the latest fence used against each timeline */
if (fence->context)
- intel_timeline_sync_set(rq->timeline, fence);
+ intel_timeline_sync_set(i915_request_timeline(rq),
+ fence);
} while (--nchild);
return 0;
@@ -1072,7 +1116,7 @@ void i915_request_skip(struct i915_request *rq, int error)
static struct i915_request *
__i915_request_add_to_timeline(struct i915_request *rq)
{
- struct intel_timeline *timeline = rq->timeline;
+ struct intel_timeline *timeline = i915_request_timeline(rq);
struct i915_request *prev;
/*
@@ -1095,8 +1139,8 @@ __i915_request_add_to_timeline(struct i915_request *rq)
* precludes optimising to use semaphores serialisation of a single
* timeline across engines.
*/
- prev = rcu_dereference_protected(timeline->last_request.request,
- lockdep_is_held(&timeline->mutex));
+ prev = to_request(__i915_active_fence_set(&timeline->last_request,
+ &rq->fence));
if (prev && !i915_request_completed(prev)) {
if (is_power_of_2(prev->engine->mask | rq->engine->mask))
i915_sw_fence_await_sw_fence(&rq->submit,
@@ -1121,7 +1165,6 @@ __i915_request_add_to_timeline(struct i915_request *rq)
* us, the timeline will hold its seqno which is later than ours.
*/
GEM_BUG_ON(timeline->seqno != rq->fence.seqno);
- __i915_active_request_set(&timeline->last_request, rq);
return prev;
}
@@ -1185,7 +1228,7 @@ void __i915_request_queue(struct i915_request *rq,
void i915_request_add(struct i915_request *rq)
{
struct i915_sched_attr attr = rq->gem_context->sched;
- struct intel_timeline * const tl = rq->timeline;
+ struct intel_timeline * const tl = i915_request_timeline(rq);
struct i915_request *prev;
lockdep_assert_held(&tl->mutex);
@@ -1240,7 +1283,9 @@ void i915_request_add(struct i915_request *rq)
* work on behalf of others -- but instead we should benefit from
* improved resource management. (Well, that's the theory at least.)
*/
- if (prev && i915_request_completed(prev) && prev->timeline == tl)
+ if (prev &&
+ i915_request_completed(prev) &&
+ rcu_access_pointer(prev->timeline) == tl)
i915_request_retire_upto(prev);
mutex_unlock(&tl->mutex);
@@ -1403,7 +1448,7 @@ long i915_request_wait(struct i915_request *rq,
* completion. That requires having a good predictor for the request
* duration, which we currently lack.
*/
- if (CONFIG_DRM_I915_SPIN_REQUEST &&
+ if (IS_ACTIVE(CONFIG_DRM_I915_SPIN_REQUEST) &&
__i915_spin_request(rq, state, CONFIG_DRM_I915_SPIN_REQUEST)) {
dma_fence_signal(&rq->fence);
goto out;
@@ -1423,7 +1468,7 @@ long i915_request_wait(struct i915_request *rq,
*/
if (flags & I915_WAIT_PRIORITY) {
if (!i915_request_started(rq) && INTEL_GEN(rq->i915) >= 6)
- gen6_rps_boost(rq);
+ intel_rps_boost(rq);
i915_schedule_bump_priority(rq, I915_PRIORITY_WAIT);
}
@@ -1449,6 +1494,7 @@ long i915_request_wait(struct i915_request *rq,
break;
}
+ intel_engine_flush_submission(rq->engine);
timeout = io_schedule_timeout(timeout);
}
__set_current_state(TASK_RUNNING);
@@ -1456,53 +1502,11 @@ long i915_request_wait(struct i915_request *rq,
dma_fence_remove_callback(&rq->fence, &wait.cb);
out:
- mutex_release(&rq->engine->gt->reset.mutex.dep_map, 0, _THIS_IP_);
+ mutex_release(&rq->engine->gt->reset.mutex.dep_map, _THIS_IP_);
trace_i915_request_wait_end(rq);
return timeout;
}
-bool i915_retire_requests(struct drm_i915_private *i915)
-{
- struct intel_gt_timelines *timelines = &i915->gt.timelines;
- struct intel_timeline *tl, *tn;
- unsigned long flags;
- LIST_HEAD(free);
-
- spin_lock_irqsave(&timelines->lock, flags);
- list_for_each_entry_safe(tl, tn, &timelines->active_list, link) {
- if (!mutex_trylock(&tl->mutex))
- continue;
-
- intel_timeline_get(tl);
- GEM_BUG_ON(!tl->active_count);
- tl->active_count++; /* pin the list element */
- spin_unlock_irqrestore(&timelines->lock, flags);
-
- retire_requests(tl);
-
- spin_lock_irqsave(&timelines->lock, flags);
-
- /* Resume iteration after dropping lock */
- list_safe_reset_next(tl, tn, link);
- if (!--tl->active_count)
- list_del(&tl->link);
-
- mutex_unlock(&tl->mutex);
-
- /* Defer the final release to after the spinlock */
- if (refcount_dec_and_test(&tl->kref.refcount)) {
- GEM_BUG_ON(tl->active_count);
- list_add(&tl->link, &free);
- }
- }
- spin_unlock_irqrestore(&timelines->lock, flags);
-
- list_for_each_entry_safe(tl, tn, &free, link)
- __intel_timeline_free(&tl->kref);
-
- return !list_empty(&timelines->active_list);
-}
-
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftests/mock_request.c"
#include "selftests/i915_request.c"