// SPDX-License-Identifier: MIT
/*
 * Copyright © 2020 Intel Corporation
 */

#include "gen2_engine_cs.h"
#include "i915_drv.h"
#include "intel_engine.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_ring.h"

int gen2_emit_flush(struct i915_request *rq, u32 mode)
{
	unsigned int num_store_dw = 12;
	u32 cmd, *cs;

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE)
		cmd |= MI_READ_FLUSH;

	cs = intel_ring_begin(rq, 2 + 4 * num_store_dw);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;
	while (num_store_dw--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = 0;
		*cs++ = MI_FLUSH | MI_NO_WRITE_FLUSH;
	}
	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

int gen4_emit_flush_rcs(struct i915_request *rq, u32 mode)
{
	u32 cmd, *cs;
	int i;

	/*
	 * read/write caches:
	 *
	 * I915_GEM_DOMAIN_RENDER is always invalidated, but is
	 * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is
	 * also flushed at 2d versus 3d pipeline switches.
	 *
	 * read-only caches:
	 *
	 * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if
	 * MI_READ_FLUSH is set, and is always flushed on 965.
	 *
	 * I915_GEM_DOMAIN_COMMAND may not exist?
	 *
	 * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is
	 * invalidated when MI_EXE_FLUSH is set.
	 *
	 * I915_GEM_DOMAIN_VERTEX, which exists on 965, is
	 * invalidated with every MI_FLUSH.
	 *
	 * TLBs:
	 *
	 * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND
	 * and I915_GEM_DOMAIN_CPU are invalidated at PTE write and
	 * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER
	 * are flushed at any MI_FLUSH.
	 */

	cmd = MI_FLUSH;
	if (mode & EMIT_INVALIDATE) {
		cmd |= MI_EXE_FLUSH;
		if (IS_G4X(rq->engine->i915) ||
		    GRAPHICS_VER(rq->engine->i915) == 5)
			cmd |= MI_INVALIDATE_ISP;
	}

	i = 2;
	if (mode & EMIT_INVALIDATE)
		i += 20;

	cs = intel_ring_begin(rq, i);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = cmd;

	/*
	 * A random delay to let the CS invalidate take effect? Without this
	 * delay, the GPU relocation path fails as the CS does not see
	 * the updated contents. Just as important, if we apply the flushes
	 * to the EMIT_FLUSH branch (i.e. immediately after the relocation
	 * write and before the invalidate on the next batch), the relocations
	 * still fail. This implies that there is a delay following
	 * invalidation that is required to reset the caches, as opposed to a
	 * delay to ensure the memory is written.
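	 *
	 * (The twelve MI_FLUSH dwords below, bracketed by two PIPE_CONTROL
	 * QW writes to the scratch page, appear to be what provides that
	 * delay.)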
	 */
	if (mode & EMIT_INVALIDATE) {
		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;

		for (i = 0; i < 12; i++)
			*cs++ = MI_FLUSH;

		*cs++ = GFX_OP_PIPE_CONTROL(4) | PIPE_CONTROL_QW_WRITE;
		*cs++ = intel_gt_scratch_offset(rq->engine->gt,
						INTEL_GT_SCRATCH_FIELD_DEFAULT) |
			PIPE_CONTROL_GLOBAL_GTT;
		*cs++ = 0;
		*cs++ = 0;
	}

	*cs++ = cmd;

	intel_ring_advance(rq, cs);

	return 0;
}

int gen4_emit_flush_vcs(struct i915_request *rq, u32 mode)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_FLUSH;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	return 0;
}

static u32 *__gen2_emit_breadcrumb(struct i915_request *rq, u32 *cs,
				   int flush, int post)
{
	GEM_BUG_ON(i915_request_active_timeline(rq)->hwsp_ggtt !=
		   rq->engine->status_page.vma);
	GEM_BUG_ON(offset_in_page(rq->hwsp_seqno) != I915_GEM_HWS_SEQNO_ADDR);

	*cs++ = MI_FLUSH;

	while (flush--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SCRATCH * sizeof(u32);
		*cs++ = rq->fence.seqno;
	}

	while (post--) {
		*cs++ = MI_STORE_DWORD_INDEX;
		*cs++ = I915_GEM_HWS_SEQNO_ADDR;
		*cs++ = rq->fence.seqno;
	}

	*cs++ = MI_USER_INTERRUPT;

	rq->tail = intel_ring_offset(rq, cs);
	assert_ring_tail_valid(rq->ring, rq->tail);

	return cs;
}

u32 *gen3_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 16, 8);
}

u32 *gen5_emit_breadcrumb(struct i915_request *rq, u32 *cs)
{
	return __gen2_emit_breadcrumb(rq, cs, 8, 8);
}

/* Just userspace ABI convention to limit the wa batch bo to a reasonable size */
#define I830_BATCH_LIMIT SZ_256K
#define I830_TLB_ENTRIES (2)
#define I830_WA_SIZE max(I830_TLB_ENTRIES * SZ_4K, I830_BATCH_LIMIT)

int i830_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs, cs_offset =
		intel_gt_scratch_offset(rq->engine->gt,
					INTEL_GT_SCRATCH_FIELD_DEFAULT);

	GEM_BUG_ON(rq->engine->gt->scratch->size < I830_WA_SIZE);

	cs = intel_ring_begin(rq, 6);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	/* Evict the invalid PTE TLBs */
	*cs++ = COLOR_BLT_CMD | BLT_WRITE_RGBA;
	*cs++ = BLT_DEPTH_32 | BLT_ROP_COLOR_COPY | 4096;
	*cs++ = I830_TLB_ENTRIES << 16 | 4; /* load each page */
	*cs++ = cs_offset;
	*cs++ = 0xdeadbeef;
	*cs++ = MI_NOOP;
	intel_ring_advance(rq, cs);

	if ((dispatch_flags & I915_DISPATCH_PINNED) == 0) {
		if (len > I830_BATCH_LIMIT)
			return -ENOSPC;

		cs = intel_ring_begin(rq, 6 + 2);
		if (IS_ERR(cs))
			return PTR_ERR(cs);

		/*
		 * Blit the batch (which now has all relocs applied) to the
		 * stable batch scratch bo area (so that the CS never
		 * stumbles over its tlb invalidation bug) ...
		 */
		*cs++ = SRC_COPY_BLT_CMD | BLT_WRITE_RGBA | (6 - 2);
		*cs++ = BLT_DEPTH_32 | BLT_ROP_SRC_COPY | 4096;
		*cs++ = DIV_ROUND_UP(len, 4096) << 16 | 4096;
		*cs++ = cs_offset;
		*cs++ = 4096;
		*cs++ = offset;

		*cs++ = MI_FLUSH;
		*cs++ = MI_NOOP;
		intel_ring_advance(rq, cs);

		/*
		 * ... and execute it.
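		 *
		 * (Redirecting offset to cs_offset makes the BB_START below
		 * run the copy in the scratch area rather than the original
		 * batch buffer.)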
		 */
		offset = cs_offset;
	}

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen3_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 len,
		       unsigned int dispatch_flags)
{
	u32 *cs;

	if (!(dispatch_flags & I915_DISPATCH_SECURE))
		offset |= MI_BATCH_NON_SECURE;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

int gen4_emit_bb_start(struct i915_request *rq,
		       u64 offset, u32 length,
		       unsigned int dispatch_flags)
{
	u32 security;
	u32 *cs;

	security = MI_BATCH_NON_SECURE_I965;
	if (dispatch_flags & I915_DISPATCH_SECURE)
		security = 0;

	cs = intel_ring_begin(rq, 2);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT | security;
	*cs++ = offset;
	intel_ring_advance(rq, cs);

	return 0;
}

void gen2_irq_enable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
	ENGINE_POSTING_READ16(engine, RING_IMR);
}

void gen2_irq_disable(struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = engine->i915;

	i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write16(&i915->uncore, GEN2_IMR, i915->irq_mask);
}

void gen3_irq_enable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask &= ~engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
	intel_uncore_posting_read_fw(engine->uncore, GEN2_IMR);
}

void gen3_irq_disable(struct intel_engine_cs *engine)
{
	engine->i915->irq_mask |= engine->irq_enable_mask;
	intel_uncore_write(engine->uncore, GEN2_IMR, engine->i915->irq_mask);
}

void gen5_irq_enable(struct intel_engine_cs *engine)
{
	gen5_gt_enable_irq(engine->gt, engine->irq_enable_mask);
}

void gen5_irq_disable(struct intel_engine_cs *engine)
{
	gen5_gt_disable_irq(engine->gt, engine->irq_enable_mask);
}