/** * Emits a PIPE_CONTROL with a non-zero post-sync operation, for * implementing two workarounds on gen6. From section 1.4.7.1 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1: * * [DevSNB-C+{W/A}] Before any depth stall flush (including those * produced by non-pipelined state commands), software needs to first * send a PIPE_CONTROL with no bits set except Post-Sync Operation != * 0. * * [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable * =1, a PIPE_CONTROL with any non-zero post-sync-op is required. * * And the workaround for these two requires this workaround first: * * [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent * BEFORE the pipe-control with a post-sync op and no write-cache * flushes. * * And this last workaround is tricky because of the requirements on * that bit. From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM * volume 2 part 1: * * "1 of the following must also be set: * - Render Target Cache Flush Enable ([12] of DW1) * - Depth Cache Flush Enable ([0] of DW1) * - Stall at Pixel Scoreboard ([1] of DW1) * - Depth Stall ([13] of DW1) * - Post-Sync Operation ([13] of DW1) * - Notify Enable ([8] of DW1)" * * The cache flushes require the workaround flush that triggered this * one, so we can't use it. Depth stall would trigger the same. * Post-sync nonzero is what triggered this second workaround, so we * can't use that one either. Notify enable is IRQs, which aren't * really our business. That leaves only stall at scoreboard. */ static int intel_emit_post_sync_nonzero_flush(struct intel_ring_buffer *ring) { struct pipe_control *pc = ring->private; u32 scratch_addr = pc->gtt_offset + 128; int ret; ret = intel_ring_begin(ring, 6); if (ret) return ret; intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5)); intel_ring_emit(ring, PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD); intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */ intel_ring_emit(ring, 0); /* low dword */ intel_ring_emit(ring, 0); /* high dword */ intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); ret = intel_ring_begin(ring, 6); if (ret) return ret; intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(5)); intel_ring_emit(ring, PIPE_CONTROL_QW_WRITE); intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); /* address */ intel_ring_emit(ring, 0); intel_ring_emit(ring, 0); intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); return 0; }
int intel_ctx_workarounds_emit(struct i915_request *rq) { struct i915_workarounds *w = &rq->i915->workarounds; u32 *cs; int ret, i; if (w->count == 0) return 0; ret = rq->engine->emit_flush(rq, EMIT_BARRIER); if (ret) return ret; cs = intel_ring_begin(rq, (w->count * 2 + 2)); if (IS_ERR(cs)) return PTR_ERR(cs); *cs++ = MI_LOAD_REGISTER_IMM(w->count); for (i = 0; i < w->count; i++) { *cs++ = i915_mmio_reg_offset(w->reg[i].addr); *cs++ = w->reg[i].value; } *cs++ = MI_NOOP; intel_ring_advance(rq, cs); ret = rq->engine->emit_flush(rq, EMIT_BARRIER); if (ret) return ret; return 0; }
static int gpu_set(struct drm_i915_gem_object *obj, unsigned long offset, u32 v) { struct drm_i915_private *i915 = to_i915(obj->base.dev); struct drm_i915_gem_request *rq; struct i915_vma *vma; u32 *cs; int err; err = i915_gem_object_set_to_gtt_domain(obj, true); if (err) return err; vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0); if (IS_ERR(vma)) return PTR_ERR(vma); rq = i915_gem_request_alloc(i915->engine[RCS], i915->kernel_context); if (IS_ERR(rq)) { i915_vma_unpin(vma); return PTR_ERR(rq); } cs = intel_ring_begin(rq, 4); if (IS_ERR(cs)) { __i915_add_request(rq, false); i915_vma_unpin(vma); return PTR_ERR(cs); } if (INTEL_GEN(i915) >= 8) { *cs++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22; *cs++ = lower_32_bits(i915_ggtt_offset(vma) + offset); *cs++ = upper_32_bits(i915_ggtt_offset(vma) + offset); *cs++ = v; } else if (INTEL_GEN(i915) >= 4) { *cs++ = MI_STORE_DWORD_IMM_GEN4 | 1 << 22; *cs++ = 0; *cs++ = i915_ggtt_offset(vma) + offset; *cs++ = v; } else { *cs++ = MI_STORE_DWORD_IMM | 1 << 22; *cs++ = i915_ggtt_offset(vma) + offset; *cs++ = v; *cs++ = MI_NOOP; } intel_ring_advance(rq, cs); i915_vma_move_to_active(vma, rq, EXEC_OBJECT_WRITE); i915_vma_unpin(vma); reservation_object_lock(obj->resv, NULL); reservation_object_add_excl_fence(obj->resv, &rq->fence); reservation_object_unlock(obj->resv); __i915_add_request(rq, true); return 0; }
static int gen6_render_ring_flush(struct intel_ring_buffer *ring, u32 invalidate_domains, u32 flush_domains) { u32 flags = 0; struct pipe_control *pc = ring->private; u32 scratch_addr = pc->gtt_offset + 128; int ret; /* Force SNB workarounds for PIPE_CONTROL flushes */ ret = intel_emit_post_sync_nonzero_flush(ring); if (ret) return ret; /* Just flush everything. Experiments have shown that reducing the * number of bits based on the write domains has little performance * impact. */ if (flush_domains) { flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; /* * Ensure that any following seqno writes only happen * when the render cache is indeed flushed. */ flags |= PIPE_CONTROL_CS_STALL; } if (invalidate_domains) { flags |= PIPE_CONTROL_TLB_INVALIDATE; flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; /* * TLB invalidate requires a post-sync write. */ flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; } ret = intel_ring_begin(ring, 4); if (ret) return ret; intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); intel_ring_emit(ring, flags); intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); intel_ring_emit(ring, 0); intel_ring_advance(ring); return 0; }
static inline int mi_set_context(struct intel_ring_buffer *ring, struct i915_hw_context *new_context, u32 hw_flags) { int ret; /* w/a: If Flush TLB Invalidation Mode is enabled, driver must do a TLB * invalidation prior to MI_SET_CONTEXT. On GEN6 we don't set the value * explicitly, so we rely on the value at ring init, stored in * itlb_before_ctx_switch. */ if (IS_GEN6(ring->dev) && ring->itlb_before_ctx_switch) { ret = ring->flush(ring, I915_GEM_GPU_DOMAINS, 0); if (ret) return ret; } ret = intel_ring_begin(ring, 6); if (ret) return ret; /* WaProgramMiArbOnOffAroundMiSetContext:ivb,vlv,hsw */ if (IS_GEN7(ring->dev)) intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE); else intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_SET_CONTEXT); intel_ring_emit(ring, new_context->obj->gtt_offset | MI_MM_SPACE_GTT | MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN | hw_flags); /* w/a: MI_SET_CONTEXT must always be followed by MI_NOOP */ intel_ring_emit(ring, MI_NOOP); if (IS_GEN7(ring->dev)) intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE); else intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); return ret; }
static int i915_write_active_seqno(struct intel_ring_buffer *ring, u32 seqno) { int ret; ret = intel_ring_begin(ring, 4); if (ret) return ret; intel_ring_emit(ring, MI_STORE_DWORD_INDEX); intel_ring_emit(ring, I915_GEM_ACTIVE_SEQNO_INDEX << MI_STORE_DWORD_INDEX_SHIFT); intel_ring_emit(ring, seqno); intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); return 0; }
static int gen7_render_ring_cs_stall_wa(struct intel_ring_buffer *ring) { int ret; ret = intel_ring_begin(ring, 4); if (ret) return ret; intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); intel_ring_emit(ring, PIPE_CONTROL_CS_STALL | PIPE_CONTROL_STALL_AT_SCOREBOARD); intel_ring_emit(ring, 0); intel_ring_emit(ring, 0); intel_ring_advance(ring); return 0; }
/* Broadwell Page Directory Pointer Descriptors */ static int gen8_write_pdp(struct intel_ring_buffer *ring, unsigned entry, uint64_t val) { int ret; BUG_ON(entry >= 4); ret = intel_ring_begin(ring, 6); if (ret) return ret; intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1)); intel_ring_emit(ring, GEN8_RING_PDP_UDW(ring, entry)); intel_ring_emit(ring, (u32)(val >> 32)); intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1)); intel_ring_emit(ring, GEN8_RING_PDP_LDW(ring, entry)); intel_ring_emit(ring, (u32)(val)); intel_ring_advance(ring); return 0; }
static int render_ring_flush(struct intel_ring_buffer *ring, u32 invalidate_domains, u32 flush_domains) { struct drm_device *dev = ring->dev; u32 cmd; int ret; /* * read/write caches: * * I915_GEM_DOMAIN_RENDER is always invalidated, but is * only flushed if MI_NO_WRITE_FLUSH is unset. On 965, it is * also flushed at 2d versus 3d pipeline switches. * * read-only caches: * * I915_GEM_DOMAIN_SAMPLER is flushed on pre-965 if * MI_READ_FLUSH is set, and is always flushed on 965. * * I915_GEM_DOMAIN_COMMAND may not exist? * * I915_GEM_DOMAIN_INSTRUCTION, which exists on 965, is * invalidated when MI_EXE_FLUSH is set. * * I915_GEM_DOMAIN_VERTEX, which exists on 965, is * invalidated with every MI_FLUSH. * * TLBs: * * On 965, TLBs associated with I915_GEM_DOMAIN_COMMAND * and I915_GEM_DOMAIN_CPU in are invalidated at PTE write and * I915_GEM_DOMAIN_RENDER and I915_GEM_DOMAIN_SAMPLER * are flushed at any MI_FLUSH. */ cmd = MI_FLUSH | MI_NO_WRITE_FLUSH; if ((invalidate_domains|flush_domains) & I915_GEM_DOMAIN_RENDER) cmd &= ~MI_NO_WRITE_FLUSH; if (INTEL_INFO(dev)->gen < 4) { /* * On the 965, the sampler cache always gets flushed * and this bit is reserved. */ if (invalidate_domains & I915_GEM_DOMAIN_SAMPLER) cmd |= MI_READ_FLUSH; } if (invalidate_domains & I915_GEM_DOMAIN_INSTRUCTION) cmd |= MI_EXE_FLUSH; if (invalidate_domains & I915_GEM_DOMAIN_COMMAND && (IS_G4X(dev) || IS_GEN5(dev))) cmd |= MI_INVALIDATE_ISP; ret = intel_ring_begin(ring, 2); if (ret) return ret; intel_ring_emit(ring, cmd); intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); return 0; }
static inline int mi_set_context(struct intel_ring_buffer *ring, struct i915_hw_context *new_context, u32 hw_flags) { int ret; u32 flags = 0; u32 scratch_addr = get_pipe_control_scratch_addr(ring) + 128; /* w/a: If Flush TLB Invalidation Mode is enabled, driver must do a TLB * invalidation prior to MI_SET_CONTEXT. On GEN6 we don't set the value * explicitly, so we rely on the value at ring init, stored in * itlb_before_ctx_switch. */ if (IS_GEN6(ring->dev) && ring->itlb_before_ctx_switch) { ret = ring->flush(ring, 0, 0); if (ret) return ret; } if (IS_GEN7(ring->dev)) ret = intel_ring_begin(ring, 6+4+7); else ret = intel_ring_begin(ring, 6); if (ret) return ret; if (IS_GEN7(ring->dev)) intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_DISABLE); else intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_NOOP); intel_ring_emit(ring, MI_SET_CONTEXT); intel_ring_emit(ring, new_context->obj->gtt_offset | MI_MM_SPACE_GTT | MI_SAVE_EXT_STATE_EN | MI_RESTORE_EXT_STATE_EN | hw_flags); /* w/a: MI_SET_CONTEXT must always be followed by MI_NOOP */ intel_ring_emit(ring, MI_NOOP); if (IS_GEN7(ring->dev)) { /* WaSendDummy3dPrimitveAfterSetContext */ /* Software must send a pipe_control with a CS stall and a post sync operation and then a dummy DRAW after every MI_SET_CONTEXT and after any PIPELINE_SELECT that is enabling 3D mode. A dummy draw is a 3DPRIMITIVE command with Indirect Parameter Enable set to 0, UAV Coherency Required set to 0, Predicate Enable set to 0, End Offset Enable set to 0, and Vertex Count Per Instance set to 0, All other parameters are a don't care */ /* Send pipe control with CS Stall and postsync op before 3D_PRIMITIVE */ flags |= PIPE_CONTROL_QW_WRITE | PIPE_CONTROL_CS_STALL; intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); intel_ring_emit(ring, flags); intel_ring_emit(ring, scratch_addr | PIPE_CONTROL_GLOBAL_GTT); intel_ring_emit(ring, 0); /* Send a dummy 3D_PRIMITVE */ intel_ring_emit(ring, GFX_OP_3DPRIMITIVE()); intel_ring_emit(ring, 4); /* PrimTopoType*/ intel_ring_emit(ring, 0); /* VertexCountPerInstance */ intel_ring_emit(ring, 0); /* StartVertexLocation */ intel_ring_emit(ring, 0); /* InstanceCount */ intel_ring_emit(ring, 0); /* StartInstanceLocation */ intel_ring_emit(ring, 0); /* BaseVertexLocation */ } if (IS_GEN7(ring->dev)) intel_ring_emit(ring, MI_ARB_ON_OFF | MI_ARB_ENABLE); else intel_ring_emit(ring, MI_NOOP); intel_ring_advance(ring); i915_add_request_noflush(ring); return ret; }
static int gen7_render_ring_flush(struct intel_ring_buffer *ring, u32 invalidate_domains, u32 flush_domains) { u32 flags = 0; struct pipe_control *pc = ring->private; u32 scratch_addr = pc->gtt_offset + 128; int ret; /* * Ensure that any following seqno writes only happen when the render * cache is indeed flushed. * * Workaround: 4th PIPE_CONTROL command (except the ones with only * read-cache invalidate bits set) must have the CS_STALL bit set. We * don't try to be clever and just set it unconditionally. */ flags |= PIPE_CONTROL_CS_STALL; /* Just flush everything. Experiments have shown that reducing the * number of bits based on the write domains has little performance * impact. */ if (flush_domains) { flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH; flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH; } if (invalidate_domains) { flags |= PIPE_CONTROL_TLB_INVALIDATE; flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE; flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE; flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE; flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE; flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE; flags |= PIPE_CONTROL_MEDIA_STATE_CLEAR; /* * TLB invalidate requires a post-sync write. */ flags |= PIPE_CONTROL_QW_WRITE; flags |= PIPE_CONTROL_GLOBAL_GTT_IVB; flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD; /* Workaround: we must issue a pipe_control with CS-stall bit * set before a pipe_control command that has the state cache * invalidate bit set. */ gen7_render_ring_cs_stall_wa(ring); } ret = intel_ring_begin(ring, 4); if (ret) return ret; intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(4)); intel_ring_emit(ring, flags); intel_ring_emit(ring, scratch_addr); intel_ring_emit(ring, 0); intel_ring_advance(ring); return 0; }