drm/i915: Signal first fence from irq handler if complete

As execlists and other non-semaphore multi-engine devices coordinate between engines using interrupts, we can shave off a few 10s of microsecond of scheduling latency by doing the fence signaling from the interrupt as opposed to a RT kthread. (Realistically the delay adds about 1% to an individual cross-engine workload.) We only signal the first fence in order to limit the amount of work we move into the interrupt handler. We also have to remember that our breadcrumbs may be unordered with respect to the interrupt and so we still require the waiter process to perform some heavyweight coherency fixups, as well as traversing the tree of waiters. v2: No need for early exit in irq handler - it breaks the flow between patches and prevents the tracepoint v3: Restore rcu hold across irq signaling of request Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com> Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20170227205850.2828-2-chris@chris-wilson.co.uk
author: Chris Wilson <chris@chris-wilson.co.uk> 2017-02-27 20:58:48 +0000
committer: Chris Wilson <chris@chris-wilson.co.uk> 2017-02-27 21:57:20 +0000
commit: 56299fb7d9047cc1d25362827073b2ac0984ed21 (patch)
tree: 311ae5bcaec77364beb1a0514886e73534a5516d /drivers/gpu/drm/i915
parent: drm/i915: Report both waiters and success from intel_engine_wakeup() (diff)
download: linux-dev-56299fb7d9047cc1d25362827073b2ac0984ed21.tar.xz
linux-dev-56299fb7d9047cc1d25362827073b2ac0984ed21.zip
6 files changed, 54 insertions, 39 deletions
diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
index 933874ddea75..71152656ebbf 100644
--- a/drivers/gpu/drm/i915/i915_drv.h
+++ b/drivers/gpu/drm/i915/i915_drv.h
@@ -4060,9 +4060,9 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
 	 * is woken.
 	 */
 	if (engine->irq_seqno_barrier &&
-	    rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh) == current &&
 	    test_and_clear_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted)) {
-		struct task_struct *tsk;
+		struct intel_breadcrumbs *b = &engine->breadcrumbs;
+		unsigned long flags;
 
 		/* The ordering of irq_posted versus applying the barrier
 		 * is crucial. The clearing of the current irq_posted must
@@ -4084,17 +4084,16 @@ __i915_request_irq_complete(const struct drm_i915_gem_request *req)
 		 * the seqno before we believe it coherent since they see
 		 * irq_posted == false but we are still running).
 		 */
-		rcu_read_lock();
-		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
-		if (tsk && tsk != current)
+		spin_lock_irqsave(&b->lock, flags);
+		if (b->first_wait && b->first_wait->tsk != current)
 			/* Note that if the bottom-half is changed as we
 			 * are sending the wake-up, the new bottom-half will
 			 * be woken by whomever made the change. We only have
 			 * to worry about when we steal the irq-posted for
 			 * ourself.
 			 */
-			wake_up_process(tsk);
-		rcu_read_unlock();
+			wake_up_process(b->first_wait->tsk);
+		spin_unlock_irqrestore(&b->lock, flags);
 
 		if (__i915_gem_request_completed(req, seqno))
 			return true;
diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c
index fbfeb5f8d069..77c3ee7a3fd0 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -1083,7 +1083,7 @@ long i915_wait_request(struct drm_i915_gem_request *req,
 	if (flags & I915_WAIT_LOCKED)
 		add_wait_queue(errq, &reset);
 
-	intel_wait_init(&wait);
+	intel_wait_init(&wait, req);
 
 restart:
 	do {
diff --git a/drivers/gpu/drm/i915/i915_gem_request.h b/drivers/gpu/drm/i915/i915_gem_request.h
index 5f73d8c0a38a..0efee879df23 100644
--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -32,10 +32,12 @@
 
 struct drm_file;
 struct drm_i915_gem_object;
+struct drm_i915_gem_request;
 
 struct intel_wait {
 	struct rb_node node;
 	struct task_struct *tsk;
+	struct drm_i915_gem_request *request;
 	u32 seqno;
 };
 
diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
index 312d30e96dc2..e06e6eb99801 100644
--- a/drivers/gpu/drm/i915/i915_irq.c
+++ b/drivers/gpu/drm/i915/i915_irq.c
@@ -1033,12 +1033,42 @@ static void ironlake_rps_change_irq_handler(struct drm_i915_private *dev_priv)
 
 static void notify_ring(struct intel_engine_cs *engine)
 {
-	bool waiters;
+	struct drm_i915_gem_request *rq = NULL;
+	struct intel_wait *wait;
 
 	atomic_inc(&engine->irq_count);
 	set_bit(ENGINE_IRQ_BREADCRUMB, &engine->irq_posted);
-	waiters = intel_engine_wakeup(engine);
-	trace_intel_engine_notify(engine, waiters);
+
+	rcu_read_lock();
+
+	spin_lock(&engine->breadcrumbs.lock);
+	wait = engine->breadcrumbs.first_wait;
+	if (wait) {
+		/* We use a callback from the dma-fence to submit
+		 * requests after waiting on our own requests. To
+		 * ensure minimum delay in queuing the next request to
+		 * hardware, signal the fence now rather than wait for
+		 * the signaler to be woken up. We still wake up the
+		 * waiter in order to handle the irq-seqno coherency
+		 * issues (we may receive the interrupt before the
+		 * seqno is written, see __i915_request_irq_complete())
+		 * and to handle coalescing of multiple seqno updates
+		 * and many waiters.
+		 */
+		if (i915_seqno_passed(intel_engine_get_seqno(engine),
+				      wait->seqno))
+			rq = wait->request;
+
+		wake_up_process(wait->tsk);
+	}
+	spin_unlock(&engine->breadcrumbs.lock);
+
+	if (rq)
+		dma_fence_signal(&rq->fence);
+
+	rcu_read_unlock();
+
+	trace_intel_engine_notify(engine, wait);
 }
 
 static void vlv_c0_read(struct drm_i915_private *dev_priv,
diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c
index c8361f350350..e7f4a4c923a2 100644
--- a/drivers/gpu/drm/i915/intel_breadcrumbs.c
+++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c
@@ -28,26 +28,18 @@
 
 unsigned int intel_engine_wakeup(struct intel_engine_cs *engine)
 {
+	struct intel_wait *wait;
+	unsigned long flags;
 	unsigned int result = 0;
 
-	/* Note that for this not to dangerously chase a dangling pointer,
-	 * we must hold the rcu_read_lock here.
-	 *
-	 * Also note that tsk is likely to be in !TASK_RUNNING state so an
-	 * early test for tsk->state != TASK_RUNNING before wake_up_process()
-	 * is unlikely to be beneficial.
-	 */
-	if (intel_engine_has_waiter(engine)) {
-		struct task_struct *tsk;
-
+	spin_lock_irqsave(&engine->breadcrumbs.lock, flags);
+	wait = engine->breadcrumbs.first_wait;
+	if (wait) {
 		result = ENGINE_WAKEUP_WAITER;
-
-		rcu_read_lock();
-		tsk = rcu_dereference(engine->breadcrumbs.irq_seqno_bh);
-		if (tsk && !wake_up_process(tsk))
+		if (!wake_up_process(wait->tsk))
 			result |= ENGINE_WAKEUP_ACTIVE;
-		rcu_read_unlock();
 	}
+	spin_unlock_irqrestore(&engine->breadcrumbs.lock, flags);
 
 	return result;
 }
@@ -301,7 +293,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 	}
 	rb_link_node(&wait->node, parent, p);
 	rb_insert_color(&wait->node, &b->waiters);
-	GEM_BUG_ON(!first && !rcu_access_pointer(b->irq_seqno_bh));
 
 	if (completed) {
 		struct rb_node *next = rb_next(completed);
@@ -310,7 +301,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 		if (next && next != &wait->node) {
 			GEM_BUG_ON(first);
 			b->first_wait = to_wait(next);
-			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
 			/* As there is a delay between reading the current
 			 * seqno, processing the completed tasks and selecting
 			 * the next waiter, we may have missed the interrupt
@@ -337,7 +327,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 	if (first) {
 		GEM_BUG_ON(rb_first(&b->waiters) != &wait->node);
 		b->first_wait = wait;
-		rcu_assign_pointer(b->irq_seqno_bh, wait->tsk);
 		/* After assigning ourselves as the new bottom-half, we must
 		 * perform a cursory check to prevent a missed interrupt.
 		 * Either we miss the interrupt whilst programming the hardware,
@@ -348,7 +337,6 @@ static bool __intel_engine_add_wait(struct intel_engine_cs *engine,
 		 */
 		__intel_breadcrumbs_enable_irq(b);
 	}
-	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh));
 	GEM_BUG_ON(!b->first_wait);
 	GEM_BUG_ON(rb_first(&b->waiters) != &b->first_wait->node);
 
@@ -396,8 +384,6 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
 		const int priority = wakeup_priority(b, wait->tsk);
 		struct rb_node *next;
 
-		GEM_BUG_ON(rcu_access_pointer(b->irq_seqno_bh) != wait->tsk);
-
 		/* We are the current bottom-half. Find the next candidate,
 		 * the first waiter in the queue on the remaining oldest
 		 * request. As multiple seqnos may complete in the time it
@@ -439,13 +425,11 @@ static void __intel_engine_remove_wait(struct intel_engine_cs *engine,
 			 * exception rather than a seqno completion.
 			 */
 			b->first_wait = to_wait(next);
-			rcu_assign_pointer(b->irq_seqno_bh, b->first_wait->tsk);
 			if (b->first_wait->seqno != wait->seqno)
 				__intel_breadcrumbs_enable_irq(b);
 			wake_up_process(b->first_wait->tsk);
 		} else {
 			b->first_wait = NULL;
-			rcu_assign_pointer(b->irq_seqno_bh, NULL);
 			__intel_breadcrumbs_disable_irq(b);
 		}
 	} else {
@@ -459,7 +443,6 @@ out:
 	GEM_BUG_ON(b->first_wait == wait);
 	GEM_BUG_ON(rb_first(&b->waiters) !=
 		   (b->first_wait ? &b->first_wait->node : NULL));
-	GEM_BUG_ON(!rcu_access_pointer(b->irq_seqno_bh) ^ RB_EMPTY_ROOT(&b->waiters));
 }
 
 void intel_engine_remove_wait(struct intel_engine_cs *engine,
@@ -621,6 +604,7 @@ void intel_engine_enable_signaling(struct drm_i915_gem_request *request)
 		return;
 
 	request->signaling.wait.tsk = b->signaler;
+	request->signaling.wait.request = request;
 	request->signaling.wait.seqno = seqno;
 	i915_gem_request_get(request);
 
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index 7d753dc1b89d..974ec11d2b1d 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -235,8 +235,6 @@ struct intel_engine_cs {
 	 * the overhead of waking that client is much preferred.
 	 */
 	struct intel_breadcrumbs {
-		struct task_struct __rcu *irq_seqno_bh; /* bh for interrupts */
-
 		spinlock_t lock; /* protects the lists of requests; irqsafe */
 		struct rb_root waiters; /* sorted by retirement, priority */
 		struct rb_root signals; /* sorted by retirement */
@@ -582,9 +580,11 @@ static inline u32 intel_hws_seqno_address(struct intel_engine_cs *engine)
 /* intel_breadcrumbs.c -- user interrupt bottom-half for waiters */
 int intel_engine_init_breadcrumbs(struct intel_engine_cs *engine);
 
-static inline void intel_wait_init(struct intel_wait *wait)
+static inline void intel_wait_init(struct intel_wait *wait,
+				   struct drm_i915_gem_request *rq)
 {
 	wait->tsk = current;
+	wait->request = rq;
 }
 
 static inline void intel_wait_init_for_seqno(struct intel_wait *wait, u32 seqno)
@@ -639,7 +639,7 @@ void intel_engine_cancel_signaling(struct drm_i915_gem_request *request);
 
 static inline bool intel_engine_has_waiter(const struct intel_engine_cs *engine)
 {
-	return rcu_access_pointer(engine->breadcrumbs.irq_seqno_bh);
+	return READ_ONCE(engine->breadcrumbs.first_wait);
 }
 
 unsigned int intel_engine_wakeup(struct intel_engine_cs *engine);
author	Chris Wilson <chris@chris-wilson.co.uk>	2017-02-27 20:58:48 +0000
committer	Chris Wilson <chris@chris-wilson.co.uk>	2017-02-27 21:57:20 +0000
commit	56299fb7d9047cc1d25362827073b2ac0984ed21 (patch)
tree	311ae5bcaec77364beb1a0514886e73534a5516d /drivers/gpu/drm/i915
parent	drm/i915: Report both waiters and success from intel_engine_wakeup() (diff)
download	linux-dev-56299fb7d9047cc1d25362827073b2ac0984ed21.tar.xz linux-dev-56299fb7d9047cc1d25362827073b2ac0984ed21.zip