// SPDX-License-Identifier: MIT /* * Copyright © 2020 Intel Corporation */ #include "gen6_engine_cs.h" #include "intel_engine.h" #include "intel_gpu_commands.h" #include "intel_gt.h" #include "intel_gt_irq.h" #include "intel_gt_pm_irq.h" #include "intel_ring.h" #define HWS_SCRATCH_ADDR (I915_GEM_HWS_SCRATCH * sizeof(u32)) /* * Emits a PIPE_CONTROL with a non-zero post-sync operation, for * implementing two workarounds on gen6. From section 1.4.7.1 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: * * [DevSNB-C+{W/A}] Before any depth stall flush (including those * produced by non-pipelined state commands), software needs to first * send a PIPE_CONTROL with no bits set except Post-Sync Operation != * 0. * * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. * * And the workaround for these two requires this workaround first: * * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent * BEFORE the pipe-control with a post-sync op and no write-cache * flushes. * * And this last workaround is tricky because of the requirements on * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM * volume 2 part 1: * * "1 of the following must also be set: * - Render Target Cache Flush Enable ([12] of DW1) * - Depth Cache Flush Enable ([0] of DW1) * - Stall at Pixel Scoreboard ([1] of DW1) * - Depth Stall ([13] of DW1) * - Post-Sync Operation ([13] of DW1) * - Notify Enable ([8] of DW1)" * * The cache flushes require the workaround flush that triggered this * one, so we can't use it. Depth stall would trigger the same. * Post-sync nonzero is what triggered this second workaround, so we * can't use that one either. Notify enable is IRQs, which aren't * really our business. That leaves only stall at scoreboard. */ static int gen6_emit_post_sync_nonzero_flush(struct i915_request *rq) { u32 scratch_addr = intel_gt_scratch_offset(rq->engine->gt, INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); u32 *cs; cs = intel_ring_begin(rq, 6); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = GFX_OP_PIPE_CONTROL(5); *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; *cs++ = 0; /* low dword */ *cs++ = 0; /* high dword */ *cs++ = MI_NOOP; intel_ring_advance(rq, cs); cs = intel_ring_begin(rq, 6); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = GFX_OP_PIPE_CONTROL(5); *cs++ = PIPE_CONTROL_QW_WRITE; *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; *cs++ = 0; *cs++ = 0; *cs++ = MI_NOOP; intel_ring_advance(rq, cs); return 0; } int gen6_emit_flush_rcs(struct i915_request *rq, u32 mode) { u32 scratch_addr = intel_gt_scratch_offset(rq->engine->gt, INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); u32 *cs, flags = 0; int ret; /* Force SNB workarounds for PIPE_CONTROL flushes */ ret = gen6_emit_post_sync_nonzero_flush(rq); if (ret) return ret; /* * Just flush everything. Experiments have shown that reducing the * number of bits based on the write domains has little performance * impact. And when rearranging requests, the order of flushes is * unknown. */ if (mode & EMIT_FLUSH) { flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; /* * Ensure that any following seqno writes only happen * when the render cache is indeed flushed. */ flags |= PIPE_CONTROL_CS_STALL; } if (mode & EMIT_INVALIDATE) { flags |= PIPE_CONTROL_TLB_INVALIDATE; flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; /* * TLB invalidate requires a post-sync write. */ flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; } cs = intel_ring_begin(rq, 4); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = flags; *cs++ = scratch_addr | PIPE_CONTROL_GLOBAL_GTT; *cs++ = 0; intel_ring_advance(rq, cs); return 0; } u32 *gen6_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) { /* First we do the gen6_emit_post_sync_nonzero_flush w/a */ *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; *cs++ = 0; *cs++ = 0; *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = PIPE_CONTROL_QW_WRITE; *cs++ = intel_gt_scratch_offset(rq->engine->gt, INTEL_GT_SCRATCH_FIELD_DEFAULT) | PIPE_CONTROL_GLOBAL_GTT; *cs++ = 0; /* Finally we can flush and with it emit the breadcrumb */ *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | PIPE_CONTROL_DEPTH_CACHE_FLUSH | PIPE_CONTROL_DC_FLUSH_ENABLE | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL); *cs++ = i915_request_active_timeline(rq)->hwsp_offset | PIPE_CONTROL_GLOBAL_GTT; *cs++ = rq->fence.seqno; *cs++ = MI_USER_INTERRUPT; *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); return cs; } static int mi_flush_dw(struct i915_request *rq, u32 flags) { u32 cmd, *cs; cs = intel_ring_begin(rq, 4); if (IS_ERR(cs)) return PTR_ERR(cs); cmd = MI_FLUSH_DW; /* * We always require a command barrier so that subsequent * commands, such as breadcrumb interrupts, are strictly ordered * wrt the contents of the write cache being flushed to memory * (and thus being coherent from the CPU). */ cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW; /* * Bspec vol 1c.3 - blitter engine command streamer: * "If ENABLED, all TLBs will be invalidated once the flush * operation is complete. This bit is only valid when the * Post-Sync Operation field is a value of 1h or 3h." */ cmd |= flags; *cs++ = cmd; *cs++ = HWS_SCRATCH_ADDR | MI_FLUSH_DW_USE_GTT; *cs++ = 0; *cs++ = MI_NOOP; intel_ring_advance(rq, cs); return 0; } static int gen6_flush_dw(struct i915_request *rq, u32 mode, u32 invflags) { return mi_flush_dw(rq, mode & EMIT_INVALIDATE ? invflags : 0); } int gen6_emit_flush_xcs(struct i915_request *rq, u32 mode) { return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB); } int gen6_emit_flush_vcs(struct i915_request *rq, u32 mode) { return gen6_flush_dw(rq, mode, MI_INVALIDATE_TLB | MI_INVALIDATE_BSD); } int gen6_emit_bb_start(struct i915_request *rq, u64 offset, u32 len, unsigned int dispatch_flags) { u32 security; u32 *cs; security = MI_BATCH_NON_SECURE_I965; if (dispatch_flags & I915_DISPATCH_SECURE) security = 0; cs = intel_ring_begin(rq, 2); if (IS_ERR(cs)) return PTR_ERR(cs); cs = __gen6_emit_bb_start(cs, offset, security); intel_ring_advance(rq, cs); return 0; } int hsw_emit_bb_start(struct i915_request *rq, u64 offset, u32 len, unsigned int dispatch_flags) { u32 security; u32 *cs; security = MI_BATCH_PPGTT_HSW | MI_BATCH_NON_SECURE_HSW; if (dispatch_flags & I915_DISPATCH_SECURE) security = 0; cs = intel_ring_begin(rq, 2); if (IS_ERR(cs)) return PTR_ERR(cs); cs = __gen6_emit_bb_start(cs, offset, security); intel_ring_advance(rq, cs); return 0; } static int gen7_stall_cs(struct i915_request *rq) { u32 *cs; cs = intel_ring_begin(rq, 4); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD; *cs++ = 0; *cs++ = 0; intel_ring_advance(rq, cs); return 0; } int gen7_emit_flush_rcs(struct i915_request *rq, u32 mode) { u32 scratch_addr = intel_gt_scratch_offset(rq->engine->gt, INTEL_GT_SCRATCH_FIELD_RENDER_FLUSH); u32 *cs, flags = 0; /* * Ensure that any following seqno writes only happen when the render * cache is indeed flushed. * * Workaround: 4th PIPE_CONTROL command (except the ones with only * read-cache invalidate bits set) must have the CS_STALL bit set. We * don't try to be clever and just set it unconditionally. */ flags |= PIPE_CONTROL_CS_STALL; /* * CS_STALL suggests at least a post-sync write. */ flags |= PIPE_CONTROL_QW_WRITE; flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; /* * Just flush everything. Experiments have shown that reducing the * number of bits based on the write domains has little performance * impact. */ if (mode & EMIT_FLUSH) { flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; flags |= PIPE_CONTROL_DC_FLUSH_ENABLE; flags |= PIPE_CONTROL_FLUSH_ENABLE; } if (mode & EMIT_INVALIDATE) { flags |= PIPE_CONTROL_TLB_INVALIDATE; flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; /* * Workaround: we must issue a pipe_control with CS-stall bit * set before a pipe_control command that has the state cache * invalidate bit set. */ gen7_stall_cs(rq); } cs = intel_ring_begin(rq, 4); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = flags; *cs++ = scratch_addr; *cs++ = 0; intel_ring_advance(rq, cs); return 0; } u32 *gen7_emit_breadcrumb_rcs(struct i915_request *rq, u32 *cs) { *cs++ = GFX_OP_PIPE_CONTROL(4); *cs++ = (PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | PIPE_CONTROL_DEPTH_CACHE_FLUSH | PIPE_CONTROL_DC_FLUSH_ENABLE | PIPE_CONTROL_FLUSH_ENABLE | PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_GLOBAL_GTT_IVB | PIPE_CONTROL_CS_STALL); *cs++ = i915_request_active_timeline(rq)->hwsp_offset; *cs++ = rq->fence.seqno; *cs++ = MI_USER_INTERRUPT; *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); return cs; } u32 *gen6_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) { GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); *cs++ = MI_FLUSH_DW | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; *cs++ = rq->fence.seqno; *cs++ = MI_USER_INTERRUPT; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); return cs; } #define GEN7_XCS_WA 32 u32 *gen7_emit_breadcrumb_xcs(struct i915_request *rq, u32 *cs) { int i; GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt != rq->engine->status_page.vma); GEM_BUG_ON(offset_in_page(i915_request_active_timeline(rq)->hwsp_offset) != I915_GEM_HWS_SEQNO_ADDR); *cs++ = MI_FLUSH_DW | MI_INVALIDATE_TLB | MI_FLUSH_DW_OP_STOREDW | MI_FLUSH_DW_STORE_INDEX; *cs++ = I915_GEM_HWS_SEQNO_ADDR | MI_FLUSH_DW_USE_GTT; *cs++ = rq->fence.seqno; for (i = 0; i < GEN7_XCS_WA; i++) { *cs++ = MI_STORE_DWORD_INDEX; *cs++ = I915_GEM_HWS_SEQNO_ADDR; *cs++ = rq->fence.seqno; } *cs++ = MI_FLUSH_DW; *cs++ = 0; *cs++ = 0; *cs++ = MI_USER_INTERRUPT; *cs++ = MI_NOOP; rq->tail = intel_ring_offset(rq, cs); assert_ring_tail_valid(rq->ring, rq->tail); return cs; } #undef GEN7_XCS_WA void gen6_irq_enable(struct intel_engine_cs *engine) { ENGINE_WRITE(engine, RING_IMR, ~(engine->irq_enable_mask | engine->irq_keep_mask)); /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ ENGINE_POSTING_READ(engine, RING_IMR); gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask); } void gen6_irq_disable(struct intel_engine_cs *engine) { ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask); gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask); } void hsw_irq_enable_vecs(struct intel_engine_cs *engine) { ENGINE_WRITE(engine, RING_IMR, ~engine->irq_enable_mask); /* Flush/delay to ensure the RING_IMR is active before the GT IMR */ ENGINE_POSTING_READ(engine, RING_IMR); gen6_gt_pm_unmask_irq(engine->gt, engine->irq_enable_mask); } void hsw_irq_disable_vecs(struct intel_engine_cs *engine) { ENGINE_WRITE(engine, RING_IMR, ~0); gen6_gt_pm_mask_irq(engine->gt, engine->irq_enable_mask); }