void
intel_blt_copy(struct intel_batchbuffer *batch,
               drm_intel_bo *src_bo, int src_x1, int src_y1, int src_pitch,
               drm_intel_bo *dst_bo, int dst_x1, int dst_y1, int dst_pitch,
               int width, int height, int bpp)
{
   uint32_t src_tiling, dst_tiling, swizzle;
   uint32_t cmd_bits = 0;
   uint32_t br13_bits;

   drm_intel_bo_get_tiling(src_bo, &src_tiling, &swizzle);
   drm_intel_bo_get_tiling(dst_bo, &dst_tiling, &swizzle);

   if (IS_965(batch->devid) && src_tiling != I915_TILING_NONE) {
      src_pitch /= 4;
      cmd_bits |= XY_SRC_COPY_BLT_SRC_TILED;
   }

   if (IS_965(batch->devid) && dst_tiling != I915_TILING_NONE) {
      dst_pitch /= 4;
      cmd_bits |= XY_SRC_COPY_BLT_DST_TILED;
   }

   br13_bits = 0;
   switch (bpp) {
   case 8:
      break;
   case 16:   /* supporting only RGB565, not ARGB1555 */
      br13_bits |= 1 << 24;
      break;
   case 32:
      br13_bits |= 3 << 24;
      cmd_bits |= XY_SRC_COPY_BLT_WRITE_ALPHA |
                  XY_SRC_COPY_BLT_WRITE_RGB;
      break;
   default:
      abort();
   }

#define CHECK_RANGE(x) ((x) >= 0 && (x) < (1 << 15))
   assert(CHECK_RANGE(src_x1) && CHECK_RANGE(src_y1) &&
          CHECK_RANGE(dst_x1) && CHECK_RANGE(dst_y1) &&
          CHECK_RANGE(width) && CHECK_RANGE(height) &&
          CHECK_RANGE(src_x1 + width) && CHECK_RANGE(src_y1 + height) &&
          CHECK_RANGE(dst_x1 + width) && CHECK_RANGE(dst_y1 + height) &&
          CHECK_RANGE(src_pitch) && CHECK_RANGE(dst_pitch));
#undef CHECK_RANGE

   BEGIN_BATCH(8);
   OUT_BATCH(XY_SRC_COPY_BLT_CMD | cmd_bits);
   OUT_BATCH((br13_bits) |
             (0xcc << 16) | /* copy ROP */
             dst_pitch);
   OUT_BATCH((dst_y1 << 16) | dst_x1); /* dst x1,y1 */
   OUT_BATCH(((dst_y1 + height) << 16) | (dst_x1 + width)); /* dst x2,y2 */
   OUT_RELOC(dst_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
   OUT_BATCH((src_y1 << 16) | src_x1); /* src x1,y1 */
   OUT_BATCH(src_pitch);
   OUT_RELOC(src_bo, I915_GEM_DOMAIN_RENDER, 0, 0);
   ADVANCE_BATCH();

   intel_batchbuffer_flush(batch);
}
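/* A minimal, self-contained sketch of the two encodings intel_blt_copy()
 * relies on: the BR13 color-depth bits and the 15-bit coordinate limit.
 * blt_br13_depth_bits() and blt_coord_in_range() are illustrative names,
 * not driver API.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t
blt_br13_depth_bits(int bpp)
{
   switch (bpp) {
   case 8:
      return 0;          /* 8bpp: color depth field is 0 */
   case 16:
      return 1u << 24;   /* RGB565 */
   case 32:
      return 3u << 24;   /* ARGB8888 */
   default:
      assert(!"unsupported bpp");
      return 0;
   }
}

/* Blitter coordinates and pitches are signed 16-bit quantities, so only
 * values in [0, 2^15) are safe; this mirrors CHECK_RANGE() above. */
static int
blt_coord_in_range(int x)
{
   return x >= 0 && x < (1 << 15);
}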
static void
upload_ps_state(struct brw_context *brw)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   uint32_t dw2, dw4, dw5;
   const int max_threads_shift = brw->intel.is_haswell ?
      HSW_PS_MAX_THREADS_SHIFT : IVB_PS_MAX_THREADS_SHIFT;

   /* BRW_NEW_PS_BINDING_TABLE */
   BEGIN_BATCH(2);
   OUT_BATCH(_3DSTATE_BINDING_TABLE_POINTERS_PS << 16 | (2 - 2));
   OUT_BATCH(brw->wm.bind_bo_offset);
   ADVANCE_BATCH();

   /* CACHE_NEW_SAMPLER */
   BEGIN_BATCH(2);
   OUT_BATCH(_3DSTATE_SAMPLER_STATE_POINTERS_PS << 16 | (2 - 2));
   OUT_BATCH(brw->sampler.offset);
   ADVANCE_BATCH();

   /* CACHE_NEW_WM_PROG */
   if (brw->wm.prog_data->nr_params == 0) {
      /* Disable the push constant buffers. */
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (7 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (7 - 2));

      OUT_BATCH(ALIGN(brw->wm.prog_data->nr_params,
                      brw->wm.prog_data->dispatch_width) / 8);
      OUT_BATCH(0);
      /* Pointer to the WM constant buffer.  Covered by the set of
       * state flags from gen6_upload_wm_push_constants.
       */
      OUT_BATCH(brw->wm.push_const_offset);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   dw2 = dw4 = dw5 = 0;

   /* CACHE_NEW_SAMPLER */
   dw2 |= (ALIGN(brw->sampler.count, 4) / 4) << GEN7_PS_SAMPLER_COUNT_SHIFT;

   /* Use ALT floating point mode for ARB fragment programs, because they
    * require 0^0 == 1.  Even though _CurrentFragmentProgram is used for
    * rendering, CurrentFragmentProgram is used for this check to
    * differentiate between the GLSL and non-GLSL cases.
    */
   if (intel->ctx.Shader.CurrentFragmentProgram == NULL)
      dw2 |= GEN7_PS_FLOATING_POINT_MODE_ALT;

   if (intel->is_haswell)
      dw4 |= SET_FIELD(1, HSW_PS_SAMPLE_MASK); /* 1 sample for now */

   dw4 |= (brw->max_wm_threads - 1) << max_threads_shift;

   /* CACHE_NEW_WM_PROG */
   if (brw->wm.prog_data->nr_params > 0)
      dw4 |= GEN7_PS_PUSH_CONSTANT_ENABLE;

   /* CACHE_NEW_WM_PROG | _NEW_COLOR
    *
    * The hardware wedges if you have this bit set but don't turn on any
    * dual source blend factors.
    */
   if (brw->wm.prog_data->dual_src_blend &&
       (ctx->Color.BlendEnabled & 1) &&
       ctx->Color.Blend[0]._UsesDualSrc) {
      dw4 |= GEN7_PS_DUAL_SOURCE_BLEND_ENABLE;
   }

   /* BRW_NEW_FRAGMENT_PROGRAM */
   if (brw->fragment_program->Base.InputsRead != 0)
      dw4 |= GEN7_PS_ATTRIBUTE_ENABLE;

   dw4 |= GEN7_PS_8_DISPATCH_ENABLE;
   if (brw->wm.prog_data->prog_offset_16)
      dw4 |= GEN7_PS_16_DISPATCH_ENABLE;

   dw5 |= (brw->wm.prog_data->first_curbe_grf <<
           GEN7_PS_DISPATCH_START_GRF_SHIFT_0);
   dw5 |= (brw->wm.prog_data->first_curbe_grf_16 <<
           GEN7_PS_DISPATCH_START_GRF_SHIFT_2);

   BEGIN_BATCH(8);
   OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
   OUT_BATCH(brw->wm.prog_offset);
   OUT_BATCH(dw2);
   if (brw->wm.prog_data->total_scratch) {
      OUT_RELOC(brw->wm.scratch_bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                ffs(brw->wm.prog_data->total_scratch) - 11);
   } else {
      OUT_BATCH(0);
   }
   OUT_BATCH(dw4);
   OUT_BATCH(dw5);
   OUT_BATCH(0); /* kernel 1 pointer */
   OUT_BATCH(brw->wm.prog_offset + brw->wm.prog_data->prog_offset_16);
   ADVANCE_BATCH();
}
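/* Two pieces of arithmetic above deserve spelling out.  A sketch under the
 * assumptions that total_scratch is a power of two of at least 1KB and that
 * dispatch_width is a power of two (8 or 16); scratch_space_field() and
 * push_const_regs() are illustrative helpers, not driver API.
 */
#include <assert.h>
#include <stdint.h>
#include <strings.h> /* ffs() */

/* 3DSTATE_PS encodes per-thread scratch as log2(bytes) - 10, so field 0
 * means 1KB, 1 means 2KB, and so on.  For a power of two, ffs() returns
 * log2(x) + 1, hence the "- 11" used with OUT_RELOC above. */
static uint32_t
scratch_space_field(uint32_t total_scratch)
{
   assert(total_scratch >= 1024 &&
          (total_scratch & (total_scratch - 1)) == 0);
   return ffs(total_scratch) - 11;
}

/* The push-constant length is counted in 256-bit (8-float) registers,
 * after rounding nr_params up to a whole dispatch width. */
static uint32_t
push_const_regs(uint32_t nr_params, uint32_t dispatch_width)
{
   uint32_t aligned = (nr_params + dispatch_width - 1) &
                      ~(dispatch_width - 1);
   return aligned / 8;
}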
static inline void
emit_tx_setup(struct r200_context *r200,
              mesa_format src_mesa_format,
              mesa_format dst_mesa_format,
              struct radeon_bo *bo,
              intptr_t offset,
              unsigned width,
              unsigned height,
              unsigned pitch)
{
   uint32_t txformat = R200_TXFORMAT_NON_POWER2;
   BATCH_LOCALS(&r200->radeon);

   assert(width <= 2048);
   assert(height <= 2048);
   assert(offset % 32 == 0);

   /* XXX others?  BE/LE? */
   switch (src_mesa_format) {
   case MESA_FORMAT_B8G8R8A8_UNORM:
      txformat |= R200_TXFORMAT_ARGB8888 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   case MESA_FORMAT_A8B8G8R8_UNORM:
      txformat |= R200_TXFORMAT_RGBA8888 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   case MESA_FORMAT_R8G8B8A8_UNORM:
      txformat |= R200_TXFORMAT_ABGR8888 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   case MESA_FORMAT_B8G8R8X8_UNORM:
      txformat |= R200_TXFORMAT_ARGB8888;
      break;
   case MESA_FORMAT_B5G6R5_UNORM:
      txformat |= R200_TXFORMAT_RGB565;
      break;
   case MESA_FORMAT_B4G4R4A4_UNORM:
      txformat |= R200_TXFORMAT_ARGB4444 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   case MESA_FORMAT_B5G5R5A1_UNORM:
      txformat |= R200_TXFORMAT_ARGB1555 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   case MESA_FORMAT_A_UNORM8:
   case MESA_FORMAT_I_UNORM8:
      txformat |= R200_TXFORMAT_I8 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   case MESA_FORMAT_L_UNORM8:
      txformat |= R200_TXFORMAT_I8;
      break;
   case MESA_FORMAT_L8A8_UNORM:
      txformat |= R200_TXFORMAT_AI88 | R200_TXFORMAT_ALPHA_IN_MAP;
      break;
   default:
      break;
   }

   if (bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
      offset |= R200_TXO_MACRO_TILE;
   if (bo->flags & RADEON_BO_FLAGS_MICRO_TILE)
      offset |= R200_TXO_MICRO_TILE;

   switch (dst_mesa_format) {
   case MESA_FORMAT_B8G8R8A8_UNORM:
   case MESA_FORMAT_B8G8R8X8_UNORM:
   case MESA_FORMAT_B5G6R5_UNORM:
   case MESA_FORMAT_B4G4R4A4_UNORM:
   case MESA_FORMAT_B5G5R5A1_UNORM:
   case MESA_FORMAT_A_UNORM8:
   case MESA_FORMAT_L_UNORM8:
   case MESA_FORMAT_I_UNORM8:
   default:
      /* no swizzle required */
      BEGIN_BATCH(10);
      OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
                                        RADEON_TEX_BLEND_0_ENABLE));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND_0, (R200_TXC_ARG_A_ZERO |
                                            R200_TXC_ARG_B_ZERO |
                                            R200_TXC_ARG_C_R0_COLOR |
                                            R200_TXC_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, (R200_TXC_CLAMP_0_1 |
                                             R200_TXC_OUTPUT_REG_R0));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND_0, (R200_TXA_ARG_A_ZERO |
                                            R200_TXA_ARG_B_ZERO |
                                            R200_TXA_ARG_C_R0_ALPHA |
                                            R200_TXA_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND2_0, (R200_TXA_CLAMP_0_1 |
                                             R200_TXA_OUTPUT_REG_R0));
      END_BATCH();
      break;
   case MESA_FORMAT_A8B8G8R8_UNORM:
      BEGIN_BATCH(10);
      OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
                                        RADEON_TEX_BLEND_0_ENABLE));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND_0, (R200_TXC_ARG_A_ZERO |
                                            R200_TXC_ARG_B_ZERO |
                                            R200_TXC_ARG_C_R0_COLOR |
                                            R200_TXC_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, (R200_TXC_CLAMP_0_1 |
                                             R200_TXC_OUTPUT_ROTATE_GBA |
                                             R200_TXC_OUTPUT_REG_R0));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND_0, (R200_TXA_ARG_A_ZERO |
                                            R200_TXA_ARG_B_ZERO |
                                            R200_TXA_ARG_C_R0_ALPHA |
                                            R200_TXA_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND2_0, (R200_TXA_CLAMP_0_1 |
                                             (R200_TXA_REPL_RED <<
                                              R200_TXA_REPL_ARG_C_SHIFT) |
                                             R200_TXA_OUTPUT_REG_R0));
      END_BATCH();
      break;
   case MESA_FORMAT_R8G8B8A8_UNORM:
      BEGIN_BATCH(34);
      OUT_BATCH_REGVAL(RADEON_PP_CNTL, (RADEON_TEX_0_ENABLE |
                                        RADEON_TEX_BLEND_0_ENABLE |
                                        RADEON_TEX_BLEND_1_ENABLE |
                                        RADEON_TEX_BLEND_2_ENABLE |
                                        RADEON_TEX_BLEND_3_ENABLE));
      /* r1.r = r0.b */
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND_0, (R200_TXC_ARG_A_ZERO |
                                            R200_TXC_ARG_B_ZERO |
                                            R200_TXC_ARG_C_R0_COLOR |
                                            R200_TXC_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_0, (R200_TXC_CLAMP_0_1 |
                                             R200_TXC_OUTPUT_MASK_R |
                                             (R200_TXC_REPL_BLUE <<
                                              R200_TXC_REPL_ARG_C_SHIFT) |
                                             R200_TXC_OUTPUT_REG_R1));
      /* r1.a = r0.a */
      OUT_BATCH_REGVAL(R200_PP_TXABLEND_0, (R200_TXA_ARG_A_ZERO |
                                            R200_TXA_ARG_B_ZERO |
                                            R200_TXA_ARG_C_R0_ALPHA |
                                            R200_TXA_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND2_0, (R200_TXA_CLAMP_0_1 |
                                             R200_TXA_OUTPUT_REG_R1));
      /* r1.g = r0.g */
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND_1, (R200_TXC_ARG_A_ZERO |
                                            R200_TXC_ARG_B_ZERO |
                                            R200_TXC_ARG_C_R0_COLOR |
                                            R200_TXC_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_1, (R200_TXC_CLAMP_0_1 |
                                             R200_TXC_OUTPUT_MASK_G |
                                             (R200_TXC_REPL_GREEN <<
                                              R200_TXC_REPL_ARG_C_SHIFT) |
                                             R200_TXC_OUTPUT_REG_R1));
      /* r1.a = r0.a */
      OUT_BATCH_REGVAL(R200_PP_TXABLEND_1, (R200_TXA_ARG_A_ZERO |
                                            R200_TXA_ARG_B_ZERO |
                                            R200_TXA_ARG_C_R0_ALPHA |
                                            R200_TXA_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND2_1, (R200_TXA_CLAMP_0_1 |
                                             R200_TXA_OUTPUT_REG_R1));
      /* r1.b = r0.r */
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND_2, (R200_TXC_ARG_A_ZERO |
                                            R200_TXC_ARG_B_ZERO |
                                            R200_TXC_ARG_C_R0_COLOR |
                                            R200_TXC_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_2, (R200_TXC_CLAMP_0_1 |
                                             R200_TXC_OUTPUT_MASK_B |
                                             (R200_TXC_REPL_RED <<
                                              R200_TXC_REPL_ARG_C_SHIFT) |
                                             R200_TXC_OUTPUT_REG_R1));
      /* r1.a = r0.a */
      OUT_BATCH_REGVAL(R200_PP_TXABLEND_2, (R200_TXA_ARG_A_ZERO |
                                            R200_TXA_ARG_B_ZERO |
                                            R200_TXA_ARG_C_R0_ALPHA |
                                            R200_TXA_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND2_2, (R200_TXA_CLAMP_0_1 |
                                             R200_TXA_OUTPUT_REG_R1));
      /* r0.rgb = r1.rgb */
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND_3, (R200_TXC_ARG_A_ZERO |
                                            R200_TXC_ARG_B_ZERO |
                                            R200_TXC_ARG_C_R1_COLOR |
                                            R200_TXC_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXCBLEND2_3, (R200_TXC_CLAMP_0_1 |
                                             R200_TXC_OUTPUT_REG_R0));
      /* r0.a = r1.a */
      OUT_BATCH_REGVAL(R200_PP_TXABLEND_3, (R200_TXA_ARG_A_ZERO |
                                            R200_TXA_ARG_B_ZERO |
                                            R200_TXA_ARG_C_R1_ALPHA |
                                            R200_TXA_OP_MADD));
      OUT_BATCH_REGVAL(R200_PP_TXABLEND2_3, (R200_TXA_CLAMP_0_1 |
                                             R200_TXA_OUTPUT_REG_R0));
      END_BATCH();
      break;
   }

   BEGIN_BATCH(18);
   OUT_BATCH_REGVAL(R200_PP_CNTL_X, 0);
   OUT_BATCH_REGVAL(R200_PP_TXMULTI_CTL_0, 0);
   OUT_BATCH_REGVAL(R200_PP_TXFILTER_0, (R200_CLAMP_S_CLAMP_LAST |
                                         R200_CLAMP_T_CLAMP_LAST |
                                         R200_MAG_FILTER_NEAREST |
                                         R200_MIN_FILTER_NEAREST));
   OUT_BATCH_REGVAL(R200_PP_TXFORMAT_0, txformat);
   OUT_BATCH_REGVAL(R200_PP_TXFORMAT_X_0, 0);
   OUT_BATCH_REGVAL(R200_PP_TXSIZE_0,
                    ((width - 1) |
                     ((height - 1) << RADEON_TEX_VSIZE_SHIFT)));
   OUT_BATCH_REGVAL(R200_PP_TXPITCH_0,
                    pitch * _mesa_get_format_bytes(src_mesa_format) - 32);

   OUT_BATCH_REGSEQ(R200_PP_TXOFFSET_0, 1);
   OUT_BATCH_RELOC(offset, bo, offset,
                   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0, 0);
   END_BATCH();
}
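/* A sketch of how the size and pitch registers at the end of
 * emit_tx_setup() are packed.  RADEON_TEX_VSIZE_SHIFT is assumed to be 16
 * here purely for illustration; r200_txsize()/r200_txpitch() are
 * hypothetical helper names.
 */
#include <stdint.h>

#define R200_SKETCH_VSIZE_SHIFT 16 /* assumed value of RADEON_TEX_VSIZE_SHIFT */

/* R200_PP_TXSIZE holds (width - 1) in the low bits and (height - 1)
 * shifted up, consistent with the 2048x2048 limits asserted above. */
static uint32_t
r200_txsize(unsigned width, unsigned height)
{
   return (width - 1) | ((height - 1) << R200_SKETCH_VSIZE_SHIFT);
}

/* R200_PP_TXPITCH takes the row pitch in bytes minus 32; pitch is in
 * pixels, cpp the source format's bytes per pixel. */
static uint32_t
r200_txpitch(unsigned pitch, unsigned cpp)
{
   return pitch * cpp - 32;
}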
void
intelEmitImmediateColorExpandBlit(struct intel_context *intel,
                                  GLuint cpp,
                                  GLubyte *src_bits, GLuint src_size,
                                  GLuint fg_color,
                                  GLshort dst_pitch,
                                  dri_bo *dst_buffer,
                                  GLuint dst_offset,
                                  GLboolean dst_tiled,
                                  GLshort x, GLshort y,
                                  GLshort w, GLshort h,
                                  GLenum logic_op)
{
   int dwords = ALIGN(src_size, 8) / 4;
   uint32_t opcode, br13, blit_cmd;

   assert(logic_op - GL_CLEAR >= 0);
   assert(logic_op - GL_CLEAR < 0x10);

   if (w < 0 || h < 0)
      return;

   dst_pitch *= cpp;

   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d, %d bytes %d dwords\n",
       __FUNCTION__,
       dst_buffer, dst_pitch, dst_offset, x, y, w, h, src_size, dwords);

   intel_batchbuffer_require_space(intel->batch,
                                   (8 * 4) +
                                   (3 * 4) +
                                   dwords * 4,
                                   NO_LOOP_CLIPRECTS);

   opcode = XY_SETUP_BLT_CMD;
   if (cpp == 4)
      opcode |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
#ifndef I915
   if (dst_tiled) {
      opcode |= XY_DST_TILED;
      dst_pitch /= 4;
   }
#endif

   br13 = dst_pitch | (translate_raster_op(logic_op) << 16) | (1 << 29);
   if (cpp == 2)
      br13 |= BR13_565;
   else
      br13 |= BR13_8888;

   blit_cmd = XY_TEXT_IMMEDIATE_BLIT_CMD | XY_TEXT_BYTE_PACKED; /* packing? */
   if (dst_tiled)
      blit_cmd |= XY_DST_TILED;

   BEGIN_BATCH(8 + 3, NO_LOOP_CLIPRECTS);
   OUT_BATCH(opcode);
   OUT_BATCH(br13);
   OUT_BATCH((0 << 16) | 0); /* clip x1, y1 */
   OUT_BATCH((100 << 16) | 100); /* clip x2, y2 */
   OUT_RELOC(dst_buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
             dst_offset);
   OUT_BATCH(0); /* bg */
   OUT_BATCH(fg_color); /* fg */
   OUT_BATCH(0); /* pattern base addr */

   OUT_BATCH(blit_cmd | ((3 - 2) + dwords));
   OUT_BATCH((y << 16) | x);
   OUT_BATCH(((y + h) << 16) | (x + w));
   ADVANCE_BATCH();

   intel_batchbuffer_data(intel->batch,
                          src_bits,
                          dwords * 4,
                          NO_LOOP_CLIPRECTS);
}
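/* The immediate-data sizing above is easy to get wrong because it mixes
 * byte and dword units: the inline bitmap is padded to a qword boundary
 * and counted in dwords, and the batch needs room for the 8-dword setup
 * packet plus the 3-dword immediate-blit header plus the data.  A
 * standalone restatement; immediate_dwords()/blit_space_bytes() are
 * illustrative names.
 */
#include <stdint.h>

static int
immediate_dwords(uint32_t src_size)
{
   /* ALIGN(src_size, 8) / 4 */
   return (int)(((src_size + 7) & ~7u) / 4);
}

static int
blit_space_bytes(uint32_t src_size)
{
   return (8 * 4) + (3 * 4) + immediate_dwords(src_size) * 4;
}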
void
r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset)
{
   BATCH_LOCALS(&rmesa->radeon);
   uint32_t voffset;
   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
   int i;

   radeon_print(RADEON_RENDER, RADEON_VERBOSE,
                "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr, offset);

   BEGIN_BATCH(sz + 2 + (nr * 2));
   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, sz - 1);
   OUT_BATCH(nr);

   {
      for (i = 0; i + 1 < nr; i += 2) {
         OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
                   (rmesa->radeon.tcl.aos[i].stride << 8) |
                   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
                   (rmesa->radeon.tcl.aos[i + 1].stride << 24));

         voffset = rmesa->radeon.tcl.aos[i + 0].offset +
                   offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
         OUT_BATCH(voffset);
         voffset = rmesa->radeon.tcl.aos[i + 1].offset +
                   offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
         OUT_BATCH(voffset);
      }

      if (nr & 1) {
         OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
                   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
         voffset = rmesa->radeon.tcl.aos[nr - 1].offset +
                   offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
         OUT_BATCH(voffset);
      }

      for (i = 0; i + 1 < nr; i += 2) {
         voffset = rmesa->radeon.tcl.aos[i + 0].offset +
                   offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
         radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
                               rmesa->radeon.tcl.aos[i + 0].bo,
                               RADEON_GEM_DOMAIN_GTT,
                               0, 0);
         voffset = rmesa->radeon.tcl.aos[i + 1].offset +
                   offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
         radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
                               rmesa->radeon.tcl.aos[i + 1].bo,
                               RADEON_GEM_DOMAIN_GTT,
                               0, 0);
      }

      if (nr & 1) {
         voffset = rmesa->radeon.tcl.aos[nr - 1].offset +
                   offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
         radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
                               rmesa->radeon.tcl.aos[nr - 1].bo,
                               RADEON_GEM_DOMAIN_GTT,
                               0, 0);
      }
   }
   END_BATCH();
}
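/* The packet-size formula in r200EmitAOS() reflects how LOAD_VBPNTR packs
 * arrays: each pair of arrays shares one descriptor dword followed by two
 * offset dwords, a trailing odd array gets a descriptor plus one offset,
 * and one dword up front holds the array count.  A standalone restatement
 * of that sizing (load_vbpntr_size() is an illustrative name):
 */
static int
load_vbpntr_size(int nr)
{
   return 1               /* array count dword */
        + (nr >> 1) * 3   /* descriptor + two offsets per pair */
        + (nr & 1) * 2;   /* descriptor + one offset for an odd array */
}
/* e.g. nr=1 -> 3, nr=2 -> 4, nr=3 -> 6, matching sz above. */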
static void
upload_clip_state(struct brw_context *brw)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;
   uint32_t depth_clamp = 0;
   uint32_t provoking, userclip;
   uint32_t dw1 = GEN6_CLIP_STATISTICS_ENABLE;
   uint32_t nonperspective_barycentric_enable_flag = 0;

   /* _NEW_BUFFERS */
   bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer);

   /* CACHE_NEW_WM_PROG */
   if (brw->wm.prog_data->barycentric_interp_modes &
       (1 << BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC)) {
      nonperspective_barycentric_enable_flag =
         GEN6_CLIP_NON_PERSPECTIVE_BARYCENTRIC_ENABLE;
   }

   dw1 |= GEN7_CLIP_EARLY_CULL;

   /* _NEW_POLYGON */
   if ((ctx->Polygon.FrontFace == GL_CCW) ^ render_to_fbo)
      dw1 |= GEN7_CLIP_WINDING_CCW;

   if (ctx->Polygon.CullFlag) {
      switch (ctx->Polygon.CullFaceMode) {
      case GL_FRONT:
         dw1 |= GEN7_CLIP_CULLMODE_FRONT;
         break;
      case GL_BACK:
         dw1 |= GEN7_CLIP_CULLMODE_BACK;
         break;
      case GL_FRONT_AND_BACK:
         dw1 |= GEN7_CLIP_CULLMODE_BOTH;
         break;
      default:
         assert(!"Should not get here: invalid CullFlag");
         break;
      }
   } else {
      dw1 |= GEN7_CLIP_CULLMODE_NONE;
   }

   /* _NEW_TRANSFORM */
   if (!ctx->Transform.DepthClamp)
      depth_clamp = GEN6_CLIP_Z_TEST;

   /* _NEW_LIGHT */
   if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
      provoking = (0 << GEN6_CLIP_TRI_PROVOKE_SHIFT) |
                  (1 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) |
                  (0 << GEN6_CLIP_LINE_PROVOKE_SHIFT);
   } else {
      provoking = (2 << GEN6_CLIP_TRI_PROVOKE_SHIFT) |
                  (2 << GEN6_CLIP_TRIFAN_PROVOKE_SHIFT) |
                  (1 << GEN6_CLIP_LINE_PROVOKE_SHIFT);
   }

   /* _NEW_TRANSFORM */
   userclip = ctx->Transform.ClipPlanesEnabled;

   BEGIN_BATCH(4);
   OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
   OUT_BATCH(dw1);
   OUT_BATCH(GEN6_CLIP_ENABLE |
             GEN6_CLIP_API_OGL |
             GEN6_CLIP_MODE_NORMAL |
             nonperspective_barycentric_enable_flag |
             GEN6_CLIP_XY_TEST |
             GEN6_CLIP_GB_TEST |
             userclip << GEN6_USER_CLIP_CLIP_DISTANCES_SHIFT |
             depth_clamp |
             provoking);
   OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
             U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
             GEN6_CLIP_FORCE_ZERO_RTAINDEX);
   ADVANCE_BATCH();
}
/**
 * Use blitting to clear the renderbuffers named by 'flags'.
 * Note: we can't use the ctx->DrawBuffer->_ColorDrawBufferIndexes field
 * since that might include software renderbuffers or renderbuffers
 * which we're clearing with triangles.
 * \param mask  bitmask of BUFFER_BIT_* values indicating buffers to clear
 */
void
intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
{
   struct intel_context *intel = intel_context(ctx);
   struct gl_framebuffer *fb = ctx->DrawBuffer;
   GLuint clear_depth;
   GLbitfield skipBuffers = 0;
   BATCH_LOCALS;

   /*
    * Compute values for clearing the buffers.
    */
   clear_depth = 0;
   if (mask & BUFFER_BIT_DEPTH) {
      clear_depth = (GLuint) (fb->_DepthMax * ctx->Depth.Clear);
   }
   if (mask & BUFFER_BIT_STENCIL) {
      clear_depth |= (ctx->Stencil.Clear & 0xff) << 24;
   }

   /* If clearing both depth and stencil, skip BUFFER_BIT_STENCIL in
    * the loop below.
    */
   if ((mask & BUFFER_BIT_DEPTH) && (mask & BUFFER_BIT_STENCIL)) {
      skipBuffers = BUFFER_BIT_STENCIL;
   }

   /* XXX Move this flush/lock into the following conditional? */
   intelFlush(&intel->ctx);
   LOCK_HARDWARE(intel);

   if (intel->numClipRects) {
      GLint cx, cy, cw, ch;
      drm_clip_rect_t clear;
      int i;

      /* Get clear bounds after locking */
      cx = fb->_Xmin;
      cy = fb->_Ymin;
      cw = fb->_Xmax - cx;
      ch = fb->_Ymax - cy;

      if (fb->Name == 0) {
         /* clearing a window */

         /* flip top to bottom */
         clear.x1 = cx + intel->drawX;
         clear.y1 = intel->driDrawable->y + intel->driDrawable->h - cy - ch;
         clear.x2 = clear.x1 + cw;
         clear.y2 = clear.y1 + ch;
      }
      else {
         /* clearing FBO */
         assert(intel->numClipRects == 1);
         assert(intel->pClipRects == &intel->fboRect);
         clear.x1 = cx;
         clear.y1 = cy;
         clear.x2 = clear.x1 + cw;
         clear.y2 = clear.y1 + ch;
         /* no change to mask */
      }

      for (i = 0; i < intel->numClipRects; i++) {
         const drm_clip_rect_t *box = &intel->pClipRects[i];
         drm_clip_rect_t b;
         GLuint buf;
         GLuint clearMask = mask;  /* use copy, since we modify it below */
         GLboolean all = (cw == fb->Width && ch == fb->Height);

         if (!all) {
            intel_intersect_cliprects(&b, &clear, box);
         }
         else {
            b = *box;
         }

         if (b.x1 >= b.x2 || b.y1 >= b.y2)
            continue;

         if (0)
            _mesa_printf("clear %d,%d..%d,%d, mask %x\n",
                         b.x1, b.y1, b.x2, b.y2, mask);

         /* Loop over all renderbuffers */
         for (buf = 0; buf < BUFFER_COUNT && clearMask; buf++) {
            const GLbitfield bufBit = 1 << buf;
            if ((clearMask & bufBit) && !(bufBit & skipBuffers)) {
               /* OK, clear this renderbuffer */
               struct intel_region *irb_region =
                  intel_get_rb_region(fb, buf);
               dri_bo *write_buffer =
                  intel_region_buffer(intel, irb_region,
                                      all ? INTEL_WRITE_FULL :
                                      INTEL_WRITE_PART);

               GLuint clearVal;
               GLint pitch, cpp;
               GLuint BR13, CMD;

               ASSERT(irb_region);

               pitch = irb_region->pitch;
               cpp = irb_region->cpp;

               DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
                   __FUNCTION__,
                   irb_region->buffer, (pitch * cpp),
                   irb_region->draw_offset,
                   b.x1, b.y1, b.x2 - b.x1, b.y2 - b.y1);

               BR13 = 0xf0 << 16;
               CMD = XY_COLOR_BLT_CMD;

               /* Setup the blit command */
               if (cpp == 4) {
                  BR13 |= (1 << 24) | (1 << 25);
                  if (buf == BUFFER_DEPTH || buf == BUFFER_STENCIL) {
                     if (clearMask & BUFFER_BIT_DEPTH)
                        CMD |= XY_BLT_WRITE_RGB;
                     if (clearMask & BUFFER_BIT_STENCIL)
                        CMD |= XY_BLT_WRITE_ALPHA;
                  }
                  else {
                     /* clearing RGBA */
                     CMD |= XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
                  }
               }
               else {
                  ASSERT(cpp == 2 || cpp == 0);
                  BR13 |= (1 << 24);
               }

#ifndef I915
               if (irb_region->tiled) {
                  CMD |= XY_DST_TILED;
                  pitch /= 4;
               }
#endif
               BR13 |= (pitch * cpp);

               if (buf == BUFFER_DEPTH || buf == BUFFER_STENCIL) {
                  clearVal = clear_depth;
               }
               else {
                  clearVal = (cpp == 4) ? intel->ClearColor8888
                                        : intel->ClearColor565;
               }
               /*
               _mesa_debug(ctx, "hardware blit clear buf %d rb id %d\n",
                           buf, irb->Base.Name);
               */
               intel_wait_flips(intel);

               assert(b.x1 < b.x2);
               assert(b.y1 < b.y2);

               BEGIN_BATCH(6, REFERENCES_CLIPRECTS);
               OUT_BATCH(CMD);
               OUT_BATCH(BR13);
               OUT_BATCH((b.y1 << 16) | b.x1);
               OUT_BATCH((b.y2 << 16) | b.x2);
               OUT_RELOC(write_buffer,
                         DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE,
                         irb_region->draw_offset);
               OUT_BATCH(clearVal);
               ADVANCE_BATCH();
               clearMask &= ~bufBit;  /* turn off bit, for faster loop exit */
            }
         }
      }
      intel_batchbuffer_flush(intel->batch);
   }

   UNLOCK_HARDWARE(intel);
}
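/* For 32bpp depth/stencil, the clear value packed above keeps 24 bits of
 * depth in the low bytes and the stencil value in the top byte.  A
 * self-contained sketch, assuming a Z24_S8 layout where _DepthMax is
 * 0xffffff; pack_z24s8_clear() is an illustrative name.
 */
#include <stdint.h>

static uint32_t
pack_z24s8_clear(float depth_clear /* in [0, 1] */, uint8_t stencil_clear)
{
   const uint32_t depth_max = 0xffffff; /* fb->_DepthMax for 24-bit depth */
   uint32_t depth_bits = (uint32_t)(depth_max * depth_clear) & depth_max;
   return depth_bits | ((uint32_t)stencil_clear << 24);
}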
static void
brw_upload_cs_state(struct brw_context *brw)
{
   if (!brw->cs.prog_data)
      return;

   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
                                                8 * 4, 64, &offset);
   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
   struct brw_stage_prog_data *prog_data = &cs_prog_data->base;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      brw->vtbl.emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, BRW_SURFACEFORMAT_RAW,
         brw->shader_time.bo->size, 1, true);
   }

   uint32_t *bind =
      (uint32_t*) brw_state_batch(brw, AUB_TRACE_BINDING_TABLE,
                                  prog_data->binding_table.size_bytes,
                                  32, &stage_state->bind_bo_offset);

   uint32_t dwords = brw->gen < 8 ? 8 : 9;
   BEGIN_BATCH(dwords);
   OUT_BATCH(MEDIA_VFE_STATE << 16 | (dwords - 2));

   if (prog_data->total_scratch) {
      if (brw->gen >= 8)
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(prog_data->total_scratch) - 11);
      else
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(prog_data->total_scratch) - 11);
   } else {
      OUT_BATCH(0);
      if (brw->gen >= 8)
         OUT_BATCH(0);
   }

   const uint32_t vfe_num_urb_entries = brw->gen >= 8 ? 2 : 0;
   const uint32_t vfe_gpgpu_mode =
      brw->gen == 7 ? SET_FIELD(1, GEN7_MEDIA_VFE_STATE_GPGPU_MODE) : 0;
   OUT_BATCH(SET_FIELD(brw->max_cs_threads - 1, MEDIA_VFE_STATE_MAX_THREADS) |
             SET_FIELD(vfe_num_urb_entries, MEDIA_VFE_STATE_URB_ENTRIES) |
             SET_FIELD(1, MEDIA_VFE_STATE_RESET_GTW_TIMER) |
             SET_FIELD(1, MEDIA_VFE_STATE_BYPASS_GTW) |
             vfe_gpgpu_mode);

   OUT_BATCH(0);
   const uint32_t vfe_urb_allocation = brw->gen >= 8 ? 2 : 0;

   /* We are uploading duplicated copies of push constant uniforms for each
    * thread.  Although the local id data needs to vary per thread, it won't
    * change for other uniform data.  Unfortunately this duplication is
    * required for gen7.  As of Haswell, this duplication can be avoided,
    * but this older mechanism with duplicated data continues to work.
    *
    * FINISHME: As of Haswell, we could make use of the
    * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
    * field to only store one copy of uniform data.
    *
    * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
    * which is described in the GPGPU_WALKER command and in the Broadwell
    * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
    * Operations => GPGPU Mode => Indirect Payload Storage.
    *
    * Note: The constant data is built in brw_upload_cs_push_constants
    * below.
    */
   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
            cs_prog_data->push.cross_thread.regs, 2);
   OUT_BATCH(SET_FIELD(vfe_urb_allocation, MEDIA_VFE_STATE_URB_ALLOC) |
             SET_FIELD(vfe_curbe_allocation, MEDIA_VFE_STATE_CURBE_ALLOC));
   OUT_BATCH(0);
   OUT_BATCH(0);
   OUT_BATCH(0);
   ADVANCE_BATCH();

   if (cs_prog_data->push.total.size > 0) {
      BEGIN_BATCH(4);
      OUT_BATCH(MEDIA_CURBE_LOAD << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(ALIGN(cs_prog_data->push.total.size, 64));
      OUT_BATCH(stage_state->push_const_offset);
      ADVANCE_BATCH();
   }

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);

   memset(desc, 0, 8 * 4);

   int dw = 0;
   desc[dw++] = brw->cs.base.prog_offset;
   if (brw->gen >= 8)
      desc[dw++] = 0; /* Kernel Start Pointer High */
   desc[dw++] = 0;
   desc[dw++] = stage_state->sampler_offset |
                ((stage_state->sampler_count + 3) / 4);
   desc[dw++] = stage_state->bind_bo_offset;
   desc[dw++] = SET_FIELD(cs_prog_data->push.per_thread.regs,
                          MEDIA_CURBE_READ_LENGTH);
   const uint32_t media_threads =
      brw->gen >= 8 ?
      SET_FIELD(cs_prog_data->threads, GEN8_MEDIA_GPGPU_THREAD_COUNT) :
      SET_FIELD(cs_prog_data->threads, MEDIA_GPGPU_THREAD_COUNT);
   assert(cs_prog_data->threads <= brw->max_cs_threads);

   assert(prog_data->total_shared <= 64 * 1024);
   uint32_t slm_size = 0;
   if (prog_data->total_shared > 0) {
      /* slm_size is in 4k increments, but must be a power of 2. */
      slm_size = 4 * 1024;
      while (slm_size < prog_data->total_shared)
         slm_size <<= 1;
      slm_size /= 4 * 1024;
   }

   desc[dw++] =
      SET_FIELD(cs_prog_data->uses_barrier, MEDIA_BARRIER_ENABLE) |
      SET_FIELD(slm_size, MEDIA_SHARED_LOCAL_MEMORY_SIZE) |
      media_threads;

   desc[dw++] =
      SET_FIELD(cs_prog_data->push.cross_thread.regs,
                CROSS_THREAD_READ_LENGTH);

   BEGIN_BATCH(4);
   OUT_BATCH(MEDIA_INTERFACE_DESCRIPTOR_LOAD << 16 | (4 - 2));
   OUT_BATCH(0);
   OUT_BATCH(8 * 4);
   OUT_BATCH(offset);
   ADVANCE_BATCH();
}
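/* The shared-local-memory field programmed above counts 4KB chunks but
 * must describe a power-of-two allocation.  The loop from the function,
 * restated as a standalone helper (slm_encode() is an illustrative name):
 */
#include <stdint.h>

static uint32_t
slm_encode(uint32_t total_shared)
{
   uint32_t slm_size = 0;
   if (total_shared > 0) {
      /* round up to a power of two, at least 4KB... */
      slm_size = 4 * 1024;
      while (slm_size < total_shared)
         slm_size <<= 1;
      /* ...then express it in 4KB units: 4KB -> 1, 8KB -> 2, 16KB -> 4 */
      slm_size /= 4 * 1024;
   }
   return slm_size;
}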
/* Push the state into the sarea and/or texture memory.
 */
static void
i915_emit_state(struct intel_context *intel)
{
   struct i915_context *i915 = i915_context(&intel->ctx);
   struct i915_hw_state *state = &i915->state;
   int i, count, aper_count;
   GLuint dirty;
   drm_intel_bo *aper_array[3 + I915_TEX_UNITS];
   GET_CURRENT_CONTEXT(ctx);
   BATCH_LOCALS;

   /* We don't hold the lock at this point, so want to make sure that
    * there won't be a buffer wrap between the state emits and the primitive
    * emit header.
    *
    * It might be better to talk about explicit places where
    * scheduling is allowed, rather than assume that it is whenever a
    * batchbuffer fills up.
    */
   intel_batchbuffer_require_space(intel,
                                   get_state_size(state) +
                                   INTEL_PRIM_EMIT_SIZE);
   count = 0;
 again:
   if (intel->batch.bo == NULL) {
      _mesa_error(ctx, GL_OUT_OF_MEMORY, "i915 emit state");
      assert(0);
   }
   aper_count = 0;
   dirty = get_dirty(state);

   aper_array[aper_count++] = intel->batch.bo;
   if (dirty & I915_UPLOAD_BUFFERS) {
      if (state->draw_region)
         aper_array[aper_count++] = state->draw_region->bo;
      if (state->depth_region)
         aper_array[aper_count++] = state->depth_region->bo;
   }

   if (dirty & I915_UPLOAD_TEX_ALL) {
      for (i = 0; i < I915_TEX_UNITS; i++) {
         if (dirty & I915_UPLOAD_TEX(i)) {
            if (state->tex_buffer[i]) {
               aper_array[aper_count++] = state->tex_buffer[i];
            }
         }
      }
   }

   if (dri_bufmgr_check_aperture_space(aper_array, aper_count)) {
      if (count == 0) {
         count++;
         intel_batchbuffer_flush(intel);
         goto again;
      } else {
         _mesa_error(ctx, GL_OUT_OF_MEMORY, "i915 emit state");
         assert(0);
      }
   }

   /* work out list of buffers to emit */

   /* Do this here as we may have flushed the batchbuffer above,
    * causing more state to be dirty!
    */
   dirty = get_dirty(state);
   state->emitted |= dirty;
   assert(get_dirty(state) == 0);

   if (INTEL_DEBUG & DEBUG_STATE)
      fprintf(stderr, "%s dirty: %x\n", __func__, dirty);

   if (dirty & I915_UPLOAD_INVARIENT) {
      if (INTEL_DEBUG & DEBUG_STATE)
         fprintf(stderr, "I915_UPLOAD_INVARIENT:\n");
      i915_emit_invarient_state(intel);
   }

   if (dirty & I915_UPLOAD_RASTER_RULES) {
      if (INTEL_DEBUG & DEBUG_STATE)
         fprintf(stderr, "I915_UPLOAD_RASTER_RULES:\n");
      emit(intel, state->RasterRules, sizeof(state->RasterRules));
   }

   if (dirty & I915_UPLOAD_CTX) {
      if (INTEL_DEBUG & DEBUG_STATE)
         fprintf(stderr, "I915_UPLOAD_CTX:\n");
      emit(intel, state->Ctx, sizeof(state->Ctx));
   }

   if (dirty & I915_UPLOAD_BLEND) {
      if (INTEL_DEBUG & DEBUG_STATE)
         fprintf(stderr, "I915_UPLOAD_BLEND:\n");
      emit(intel, state->Blend, sizeof(state->Blend));
   }

   if (dirty & I915_UPLOAD_BUFFERS) {
      GLuint count;

      if (INTEL_DEBUG & DEBUG_STATE)
         fprintf(stderr, "I915_UPLOAD_BUFFERS:\n");

      count = 17;
      if (state->Buffer[I915_DESTREG_DRAWRECT0] != MI_NOOP)
         count++;

      BEGIN_BATCH(count);
      OUT_BATCH(state->Buffer[I915_DESTREG_CBUFADDR0]);
      OUT_BATCH(state->Buffer[I915_DESTREG_CBUFADDR1]);
      if (state->draw_region) {
         OUT_RELOC(state->draw_region->bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
      } else {
         OUT_BATCH(0);
      }

      OUT_BATCH(state->Buffer[I915_DESTREG_DBUFADDR0]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DBUFADDR1]);
      if (state->depth_region) {
         OUT_RELOC(state->depth_region->bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0);
      } else {
         OUT_BATCH(0);
      }

      OUT_BATCH(state->Buffer[I915_DESTREG_DV0]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DV1]);
      OUT_BATCH(state->Buffer[I915_DESTREG_SR0]);
      OUT_BATCH(state->Buffer[I915_DESTREG_SR1]);
      OUT_BATCH(state->Buffer[I915_DESTREG_SR2]);
      OUT_BATCH(state->Buffer[I915_DESTREG_SENABLE]);

      if (state->Buffer[I915_DESTREG_DRAWRECT0] != MI_NOOP)
         OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT0]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT1]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT2]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT3]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT4]);
      OUT_BATCH(state->Buffer[I915_DESTREG_DRAWRECT5]);

      ADVANCE_BATCH();
   }

   if (dirty & I915_UPLOAD_STIPPLE) {
      if (INTEL_DEBUG & DEBUG_STATE)
         fprintf(stderr, "I915_UPLOAD_STIPPLE:\n");
      emit(intel, state->Stipple, sizeof(state->Stipple));
   }

   /* Combine all the dirty texture state into a single command to
    * avoid lockups on I915 hardware.
    */
   if (dirty & I915_UPLOAD_TEX_ALL) {
      int nr = 0;
      GLuint unwind;

      for (i = 0; i < I915_TEX_UNITS; i++)
         if (dirty & I915_UPLOAD_TEX(i))
            nr++;

      BEGIN_BATCH(2 + nr * 3);
      OUT_BATCH(_3DSTATE_MAP_STATE | (3 * nr));
      OUT_BATCH((dirty & I915_UPLOAD_TEX_ALL) >> I915_UPLOAD_TEX_0_SHIFT);
      for (i = 0; i < I915_TEX_UNITS; i++)
         if (dirty & I915_UPLOAD_TEX(i)) {
            OUT_RELOC(state->tex_buffer[i],
                      I915_GEM_DOMAIN_SAMPLER, 0,
                      state->tex_offset[i]);

            OUT_BATCH(state->Tex[i][I915_TEXREG_MS3]);
            OUT_BATCH(state->Tex[i][I915_TEXREG_MS4]);
         }
      ADVANCE_BATCH();

      unwind = intel->batch.used;
      BEGIN_BATCH(2 + nr * 3);
      OUT_BATCH(_3DSTATE_SAMPLER_STATE | (3 * nr));
      OUT_BATCH((dirty & I915_UPLOAD_TEX_ALL) >> I915_UPLOAD_TEX_0_SHIFT);
      for (i = 0; i < I915_TEX_UNITS; i++)
         if (dirty & I915_UPLOAD_TEX(i)) {
            OUT_BATCH(state->Tex[i][I915_TEXREG_SS2]);
            OUT_BATCH(state->Tex[i][I915_TEXREG_SS3]);
            OUT_BATCH(state->Tex[i][I915_TEXREG_SS4]);
         }
      ADVANCE_BATCH();
      if (i915->last_sampler &&
          memcmp(intel->batch.map + i915->last_sampler,
                 intel->batch.map + unwind,
                 (2 + nr * 3) * sizeof(int)) == 0)
         intel->batch.used = unwind;
      else
         i915->last_sampler = unwind;
   }
}
static void
gen6_blorp_emit_depth_stencil_config(struct brw_context *brw,
                                     const brw_blorp_params *params)
{
   struct intel_context *intel = &brw->intel;
   uint32_t draw_x = params->depth.x_offset;
   uint32_t draw_y = params->depth.y_offset;
   uint32_t tile_mask_x, tile_mask_y;

   gen6_blorp_compute_tile_masks(params, &tile_mask_x, &tile_mask_y);

   /* 3DSTATE_DEPTH_BUFFER */
   {
      uint32_t tile_x = draw_x & tile_mask_x;
      uint32_t tile_y = draw_y & tile_mask_y;
      uint32_t offset =
         intel_region_get_aligned_offset(params->depth.mt->region,
                                         draw_x & ~tile_mask_x,
                                         draw_y & ~tile_mask_y, false);

      /* According to the Sandy Bridge PRM, volume 2 part 1, pp326-327
       * (3DSTATE_DEPTH_BUFFER dw5), in the documentation for "Depth
       * Coordinate Offset X/Y":
       *
       *   "The 3 LSBs of both offsets must be zero to ensure correct
       *   alignment"
       *
       * We have no guarantee that tile_x and tile_y are correctly aligned,
       * since they are determined by the mipmap layout, which is only
       * aligned to multiples of 4.
       *
       * So, to avoid hanging the GPU, just smash the low order 3 bits of
       * tile_x and tile_y to 0.  This is a temporary workaround until we
       * come up with a better solution.
       */
      tile_x &= ~7;
      tile_y &= ~7;

      intel_emit_post_sync_nonzero_flush(intel);
      intel_emit_depth_stall_flushes(intel);

      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
      uint32_t pitch_bytes =
         params->depth.mt->region->pitch * params->depth.mt->region->cpp;
      OUT_BATCH((pitch_bytes - 1) |
                params->depth_format << 18 |
                1 << 21 | /* separate stencil enable */
                1 << 22 | /* hiz enable */
                BRW_TILEWALK_YMAJOR << 26 |
                1 << 27 | /* y-tiled */
                BRW_SURFACE_2D << 29);
      OUT_RELOC(params->depth.mt->region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                offset);
      OUT_BATCH(BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1 |
                (params->depth.width + tile_x - 1) << 6 |
                (params->depth.height + tile_y - 1) << 19);
      OUT_BATCH(0);
      OUT_BATCH(tile_x |
                tile_y << 16);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_HIER_DEPTH_BUFFER */
   {
      struct intel_region *hiz_region = params->depth.mt->hiz_mt->region;
      uint32_t hiz_offset =
         intel_region_get_aligned_offset(hiz_region,
                                         draw_x & ~tile_mask_x,
                                         (draw_y & ~tile_mask_y) / 2, false);

      BEGIN_BATCH(3);
      OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
      OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
      OUT_RELOC(hiz_region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                hiz_offset);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_STENCIL_BUFFER */
   {
      BEGIN_BATCH(3);
      OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}
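/* The depth-buffer setup above splits the draw offset into a tile-aligned
 * base (folded into the relocation) and an intra-tile remainder (placed in
 * the Depth Coordinate Offset fields).  A sketch of that split, including
 * the low-3-bit workaround; split_draw_offset() is an illustrative name.
 */
#include <stdint.h>

struct tile_split {
   uint32_t base_x, base_y; /* tile-aligned, folded into the bo offset */
   uint32_t tile_x, tile_y; /* intra-tile, 3DSTATE_DEPTH_BUFFER dw6 */
};

static struct tile_split
split_draw_offset(uint32_t draw_x, uint32_t draw_y,
                  uint32_t tile_mask_x, uint32_t tile_mask_y)
{
   struct tile_split t;
   t.base_x = draw_x & ~tile_mask_x;
   t.base_y = draw_y & ~tile_mask_y;
   t.tile_x = draw_x & tile_mask_x;
   t.tile_y = draw_y & tile_mask_y;
   /* The hardware requires the 3 LSBs of both offsets to be zero; smash
    * them, as gen6_blorp_emit_depth_stencil_config() does, to avoid
    * hanging the GPU. */
   t.tile_x &= ~7u;
   t.tile_y &= ~7u;
   return t;
}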
void
gen7_emit_depth_stencil_hiz(struct brw_context *brw,
                            struct intel_mipmap_tree *depth_mt,
                            uint32_t depth_offset,
                            uint32_t depthbuffer_format,
                            uint32_t depth_surface_type,
                            struct intel_mipmap_tree *stencil_mt,
                            struct intel_mipmap_tree *hiz_mt,
                            bool separate_stencil,
                            uint32_t width,
                            uint32_t height,
                            uint32_t tile_x,
                            uint32_t tile_y)
{
   struct intel_context *intel = &brw->intel;
   struct gl_context *ctx = &intel->ctx;

   intel_emit_depth_stall_flushes(intel);

   /* _NEW_DEPTH, _NEW_STENCIL */
   BEGIN_BATCH(7);
   OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
   OUT_BATCH((depth_mt ? depth_mt->region->pitch - 1 : 0) |
             (depthbuffer_format << 18) |
             ((hiz_mt ? 1 : 0) << 22) |
             ((stencil_mt != NULL && ctx->Stencil._WriteEnabled) << 27) |
             ((ctx->Depth.Mask != 0) << 28) |
             (depth_surface_type << 29));

   if (depth_mt) {
      OUT_RELOC(depth_mt->region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                depth_offset);
   } else {
      OUT_BATCH(0);
   }

   OUT_BATCH(((width + tile_x - 1) << 4) |
             ((height + tile_y - 1) << 18));
   OUT_BATCH(0);
   OUT_BATCH(tile_x | (tile_y << 16));
   OUT_BATCH(0);
   ADVANCE_BATCH();

   if (hiz_mt == NULL) {
      BEGIN_BATCH(3);
      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16 | (3 - 2));
      OUT_BATCH(hiz_mt->region->pitch - 1);
      OUT_RELOC(hiz_mt->region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                brw->depthstencil.hiz_offset);
      ADVANCE_BATCH();
   }

   if (stencil_mt == NULL) {
      BEGIN_BATCH(3);
      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (3 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      const int enabled = intel->is_haswell ? HSW_STENCIL_ENABLED : 0;

      BEGIN_BATCH(3);
      OUT_BATCH(GEN7_3DSTATE_STENCIL_BUFFER << 16 | (3 - 2));
      /* The stencil buffer has quirky pitch requirements.  From the
       * Graphics BSpec: vol2a.11 3D Pipeline Windower > Early
       * Depth/Stencil Processing > Depth/Stencil Buffer State >
       * 3DSTATE_STENCIL_BUFFER [DevIVB+], field "Surface Pitch":
       *
       *    The pitch must be set to 2x the value computed based on width,
       *    as the stencil buffer is stored with two rows interleaved.
       *
       * (Note that it is not 100% clear whether this intended to apply to
       * Gen7; the BSpec flags this comment as "DevILK,DevSNB" (which would
       * imply that it doesn't), however the comment appears on a "DevIVB+"
       * page (which would imply that it does).  Experiments with the
       * hardware indicate that it does.
       */
      OUT_BATCH(enabled |
                (2 * stencil_mt->region->pitch - 1));
      OUT_RELOC(stencil_mt->region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                brw->depthstencil.stencil_offset);
      ADVANCE_BATCH();
   }

   BEGIN_BATCH(3);
   OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
   OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0);
   OUT_BATCH(1);
   ADVANCE_BATCH();
}
/**
 * Enable or disable thread dispatch and set the HiZ op appropriately.
 */
static void
gen6_blorp_emit_wm_config(struct brw_context *brw,
                          const brw_blorp_params *params,
                          uint32_t prog_offset,
                          brw_blorp_prog_data *prog_data)
{
   struct intel_context *intel = &brw->intel;
   uint32_t dw2, dw4, dw5, dw6;

   /* Even when thread dispatch is disabled, max threads (dw5.25:31) must be
    * nonzero to prevent the GPU from hanging.  See the valid ranges in the
    * BSpec, Volume 2a.11 Windower, Section 3DSTATE_WM, Dword 5.25:31
    * "Maximum Number Of Threads".
    *
    * To be safe (and to minimize extraneous code) we go ahead and fully
    * configure the WM state whether or not there is a WM program.
    */
   dw2 = dw4 = dw5 = dw6 = 0;
   switch (params->hiz_op) {
   case GEN6_HIZ_OP_DEPTH_CLEAR:
      dw4 |= GEN6_WM_DEPTH_CLEAR;
      break;
   case GEN6_HIZ_OP_DEPTH_RESOLVE:
      dw4 |= GEN6_WM_DEPTH_RESOLVE;
      break;
   case GEN6_HIZ_OP_HIZ_RESOLVE:
      dw4 |= GEN6_WM_HIERARCHICAL_DEPTH_RESOLVE;
      break;
   case GEN6_HIZ_OP_NONE:
      break;
   default:
      assert(0);
      break;
   }
   dw4 |= GEN6_WM_STATISTICS_ENABLE;
   dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0;
   dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5;
   dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT;
   dw6 |= 0 << GEN6_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT; /* No interp */
   dw6 |= 0 << GEN6_WM_NUM_SF_OUTPUTS_SHIFT; /* No inputs from SF */
   if (params->use_wm_prog) {
      dw2 |= 1 << GEN6_WM_SAMPLER_COUNT_SHIFT; /* Up to 4 samplers */
      dw4 |= prog_data->first_curbe_grf << GEN6_WM_DISPATCH_START_GRF_SHIFT_0;
      dw5 |= GEN6_WM_16_DISPATCH_ENABLE;
      dw5 |= GEN6_WM_KILL_ENABLE; /* TODO: temporarily smash on */
      dw5 |= GEN6_WM_DISPATCH_ENABLE; /* We are rendering */
   }

   if (params->num_samples > 1) {
      dw6 |= GEN6_WM_MSRAST_ON_PATTERN;
      if (prog_data && prog_data->persample_msaa_dispatch)
         dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
      else
         dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL;
   } else {
      dw6 |= GEN6_WM_MSRAST_OFF_PIXEL;
      dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE;
   }

   BEGIN_BATCH(9);
   OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2));
   OUT_BATCH(params->use_wm_prog ? prog_offset : 0);
   OUT_BATCH(dw2);
   OUT_BATCH(0); /* No scratch needed */
   OUT_BATCH(dw4);
   OUT_BATCH(dw5);
   OUT_BATCH(dw6);
   OUT_BATCH(0); /* No other programs */
   OUT_BATCH(0); /* No other programs */
   ADVANCE_BATCH();
}
void
gen6_blorp_emit_vertices(struct brw_context *brw,
                         const brw_blorp_params *params)
{
   struct intel_context *intel = &brw->intel;
   uint32_t vertex_offset;

   /* Setup VBO for the rectangle primitive.
    *
    * A rectangle primitive (3DPRIM_RECTLIST) consists of only three
    * vertices.  The vertices reside in screen space with DirectX
    * coordinates (that is, (0, 0) is the upper left corner).
    *
    *   v2 ------ implied
    *    |        |
    *    |        |
    *   v0 ----- v1
    *
    * Since the VS is disabled, the clipper loads each VUE directly from
    * the URB.  This is controlled by the 3DSTATE_VERTEX_BUFFERS and
    * 3DSTATE_VERTEX_ELEMENTS packets below.  The VUE contents are as
    * follows:
    *   dw0: Reserved, MBZ.
    *   dw1: Render Target Array Index.  The HiZ op does not use indexed
    *        vertices, so set the dword to 0.
    *   dw2: Viewport Index.  The HiZ op disables viewport mapping and
    *        scissoring, so set the dword to 0.
    *   dw3: Point Width: The HiZ op does not emit the POINTLIST primitive,
    *        so set the dword to 0.
    *   dw4: Vertex Position X.
    *   dw5: Vertex Position Y.
    *   dw6: Vertex Position Z.
    *   dw7: Vertex Position W.
    *
    * For details, see the Sandybridge PRM, Volume 2, Part 1, Section 1.5.1
    * "Vertex URB Entry (VUE) Formats".
    */
   {
      float *vertex_data;

      const float vertices[GEN6_BLORP_VBO_SIZE] = {
         /* v0 */ 0, 0, 0, 0, (float) params->x0, (float) params->y1, 0, 1,
         /* v1 */ 0, 0, 0, 0, (float) params->x1, (float) params->y1, 0, 1,
         /* v2 */ 0, 0, 0, 0, (float) params->x0, (float) params->y0, 0, 1,
      };

      vertex_data = (float *) brw_state_batch(brw, AUB_TRACE_VERTEX_BUFFER,
                                              GEN6_BLORP_VBO_SIZE, 32,
                                              &vertex_offset);
      memcpy(vertex_data, vertices, GEN6_BLORP_VBO_SIZE);
   }

   /* 3DSTATE_VERTEX_BUFFERS */
   {
      const int num_buffers = 1;
      const int batch_length = 1 + 4 * num_buffers;

      uint32_t dw0 = GEN6_VB0_ACCESS_VERTEXDATA |
                     (GEN6_BLORP_NUM_VUE_ELEMS * sizeof(float)) <<
                     BRW_VB0_PITCH_SHIFT;

      if (intel->gen >= 7)
         dw0 |= GEN7_VB0_ADDRESS_MODIFYENABLE;

      BEGIN_BATCH(batch_length);
      OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (batch_length - 2));
      OUT_BATCH(dw0);
      /* start address */
      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
                vertex_offset);
      /* end address */
      OUT_RELOC(intel->batch.bo, I915_GEM_DOMAIN_VERTEX, 0,
                vertex_offset + GEN6_BLORP_VBO_SIZE - 1);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_VERTEX_ELEMENTS
    *
    * Fetch dwords 0 - 7 from each VUE.  See the comments above where
    * the vertex_bo is filled with data.
    */
   {
      const int num_elements = 2;
      const int batch_length = 1 + 2 * num_elements;

      BEGIN_BATCH(batch_length);
      OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (batch_length - 2));
      /* Element 0 */
      OUT_BATCH(GEN6_VE0_VALID |
                BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT |
                0 << BRW_VE0_SRC_OFFSET_SHIFT);
      OUT_BATCH(BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT |
                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT |
                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_2_SHIFT |
                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_3_SHIFT);
      /* Element 1 */
      OUT_BATCH(GEN6_VE0_VALID |
                BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT |
                16 << BRW_VE0_SRC_OFFSET_SHIFT);
      OUT_BATCH(BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT |
                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT |
                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_2_SHIFT |
                BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_3_SHIFT);
      ADVANCE_BATCH();
   }
}
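/* Only three vertices are emitted because the fourth corner of a RECTLIST
 * is implied.  A sketch of the VUE layout used above: dwords 0-3 of each
 * vertex are zeroed header fields, dwords 4-7 are x, y, z, w.
 * fill_rect_vbo() is an illustrative name.
 */
static void
fill_rect_vbo(float *v, float x0, float y0, float x1, float y1)
{
   const float verts[3][8] = {
      { 0, 0, 0, 0, x0, y1, 0, 1 }, /* v0: lower left */
      { 0, 0, 0, 0, x1, y1, 0, 1 }, /* v1: lower right */
      { 0, 0, 0, 0, x0, y0, 0, 1 }, /* v2: upper left; (x1, y0) implied */
   };
   for (int i = 0; i < 3; i++)
      for (int j = 0; j < 8; j++)
         v[i * 8 + j] = verts[i][j];
}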
static void
gen8_upload_ds_state(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const struct brw_stage_state *stage_state = &brw->tes.base;

   /* BRW_NEW_TESS_EVAL_PROGRAM */
   bool active = brw->tess_eval_program;
   assert(!active || brw->tess_ctrl_program);

   /* BRW_NEW_TES_PROG_DATA */
   const struct brw_tes_prog_data *tes_prog_data = brw->tes.prog_data;
   const struct brw_vue_prog_data *vue_prog_data = &tes_prog_data->base;
   const struct brw_stage_prog_data *prog_data = &vue_prog_data->base;

   if (active) {
      BEGIN_BATCH(9);
      OUT_BATCH(_3DSTATE_DS << 16 | (9 - 2));
      OUT_BATCH(stage_state->prog_offset);
      OUT_BATCH(0);
      OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4),
                          GEN7_DS_SAMPLER_COUNT) |
                SET_FIELD(prog_data->binding_table.size_bytes / 4,
                          GEN7_DS_BINDING_TABLE_ENTRY_COUNT));
      if (prog_data->total_scratch) {
         OUT_RELOC64(stage_state->scratch_bo,
                     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                     ffs(prog_data->total_scratch) - 11);
      } else {
         OUT_BATCH(0);
         OUT_BATCH(0);
      }
      OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg,
                          GEN7_DS_DISPATCH_START_GRF) |
                SET_FIELD(vue_prog_data->urb_read_length,
                          GEN7_DS_URB_READ_LENGTH));

      OUT_BATCH(GEN7_DS_ENABLE |
                GEN7_DS_STATISTICS_ENABLE |
                (brw->max_ds_threads - 1) << HSW_DS_MAX_THREADS_SHIFT |
                (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
                 GEN7_DS_SIMD8_DISPATCH_ENABLE : 0) |
                (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
                 GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
      OUT_BATCH(SET_FIELD(ctx->Transform.ClipPlanesEnabled,
                          GEN8_DS_USER_CLIP_DISTANCE));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(9);
      OUT_BATCH(_3DSTATE_DS << 16 | (9 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
   brw->tes.enabled = active;
}
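/* The sampler-count field above stores the number of groups of four
 * sampler-state entries, i.e. DIV_ROUND_UP(count, 4).  Spelled out as a
 * standalone helper (names are illustrative):
 */
#include <stdint.h>

static uint32_t
div_round_up(uint32_t n, uint32_t d)
{
   return (n + d - 1) / d;
}

/* 0 samplers -> 0, 1-4 -> 1, 5-8 -> 2, and so on. */
static uint32_t
ds_sampler_count_field(uint32_t sampler_count)
{
   return div_round_up(sampler_count, 4);
}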
/**
 * \copydoc gen6_hiz_exec()
 */
static void
gen7_hiz_exec(struct intel_context *intel,
              struct intel_mipmap_tree *mt,
              unsigned int level,
              unsigned int layer,
              enum gen6_hiz_op op)
{
   struct gl_context *ctx = &intel->ctx;
   struct brw_context *brw = brw_context(ctx);

   assert(op != GEN6_HIZ_OP_DEPTH_CLEAR); /* Not implemented yet. */
   assert(mt->hiz_mt != NULL);
   intel_miptree_check_level_layer(mt, level, layer);

   uint32_t depth_format;
   switch (mt->format) {
   case MESA_FORMAT_Z16:
      depth_format = BRW_DEPTHFORMAT_D16_UNORM;
      break;
   case MESA_FORMAT_Z32_FLOAT:
      depth_format = BRW_DEPTHFORMAT_D32_FLOAT;
      break;
   case MESA_FORMAT_X8_Z24:
      depth_format = BRW_DEPTHFORMAT_D24_UNORM_X8_UINT;
      break;
   default:
      assert(0);
      break;
   }

   gen6_hiz_emit_batch_head(brw);
   gen6_hiz_emit_vertices(brw, mt, level, layer);

   /* 3DSTATE_URB_VS
    * 3DSTATE_URB_HS
    * 3DSTATE_URB_DS
    * 3DSTATE_URB_GS
    *
    * If 3DSTATE_URB_VS is emitted, then the others must be as well.  From
    * the BSpec, Volume 2a "3D Pipeline Overview", Section 1.7.1
    * 3DSTATE_URB_VS:
    *     3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
    *     programmed in order for the programming of this state to be
    *     valid.
    */
   {
      /* The minimum valid value is 32.  See 3DSTATE_URB_VS,
       * Dword 1.15:0 "VS Number of URB Entries".
       */
      int num_vs_entries = 32;

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_URB_VS << 16 | (2 - 2));
      OUT_BATCH(1 << GEN7_URB_ENTRY_SIZE_SHIFT |
                0 << GEN7_URB_STARTING_ADDRESS_SHIFT |
                num_vs_entries);
      ADVANCE_BATCH();

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_URB_GS << 16 | (2 - 2));
      OUT_BATCH(0);
      ADVANCE_BATCH();

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_URB_HS << 16 | (2 - 2));
      OUT_BATCH(0);
      ADVANCE_BATCH();

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_URB_DS << 16 | (2 - 2));
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_DEPTH_STENCIL_STATE_POINTERS
    *
    * The offset is relative to
    * CMD_STATE_BASE_ADDRESS.DynamicStateBaseAddress.
    */
   {
      uint32_t depthstencil_offset;
      gen6_hiz_emit_depth_stencil_state(brw, op, &depthstencil_offset);

      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_DEPTH_STENCIL_STATE_POINTERS << 16 | (2 - 2));
      OUT_BATCH(depthstencil_offset | 1);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_VS
    *
    * Disable vertex shader.
    */
   {
      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_HS
    *
    * Disable the hull shader.
    */
   {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_HS << 16 | (7 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_TE
    *
    * Disable the tessellation engine.
    */
   {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_TE << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_DS
    *
    * Disable the domain shader.
    */
   {
      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_GS
    *
    * Disable the geometry shader.
    */
   {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_STREAMOUT
    *
    * Disable streamout.
    */
   {
      BEGIN_BATCH(3);
      OUT_BATCH(_3DSTATE_STREAMOUT << 16 | (3 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_CLIP
    *
    * Disable the clipper.
    *
    * The HiZ op emits a rectangle primitive, which requires clipping to
    * be disabled.  From page 10 of the Sandy Bridge PRM Volume 2 Part 1
    * Section 1.3 "3D Primitives Overview":
    *    RECTLIST:
    *    Either the CLIP unit should be DISABLED, or the CLIP unit's Clip
    *    Mode should be set to a value other than CLIPMODE_NORMAL.
    *
    * Also disable perspective divide.  This doesn't change the clipper's
    * output, but does spare a few electrons.
    */
   {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_CLIP << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(GEN6_CLIP_PERSPECTIVE_DIVIDE_DISABLE);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_SF
    *
    * Disable ViewportTransformEnable (dw1.1)
    *
    * From the SandyBridge PRM, Volume 2, Part 1, Section 1.3, "3D
    * Primitives Overview":
    *     RECTLIST: Viewport Mapping must be DISABLED (as is typical with
    *     the use of screen-space coordinates).
    *
    * A solid rectangle must be rendered, so set FrontFaceFillMode
    * (dw1.6:5) and BackFaceFillMode (dw1.4:3) to SOLID(0).
    *
    * From the Sandy Bridge PRM, Volume 2, Part 1, Section
    * 6.4.1.1 3DSTATE_SF, Field FrontFaceFillMode:
    *     SOLID: Any triangle or rectangle object found to be front-facing
    *     is rendered as a solid object.  This setting is required when
    *     rendering rectangle (RECTLIST) objects.
    */
   {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_SF << 16 | (7 - 2));
      OUT_BATCH(depth_format << GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_SBE */
   {
      BEGIN_BATCH(14);
      OUT_BATCH(_3DSTATE_SBE << 16 | (14 - 2));
      OUT_BATCH((1 - 1) << GEN7_SBE_NUM_OUTPUTS_SHIFT | /* only position */
                1 << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT |
                0 << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT);
      for (int i = 0; i < 12; ++i)
         OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_WM
    *
    * Disable PS thread dispatch (dw1.29) and enable the HiZ op.
    */
   {
      uint32_t dw1 = 0;

      switch (op) {
      case GEN6_HIZ_OP_DEPTH_CLEAR:
         assert(!"not implemented");
         dw1 |= GEN7_WM_DEPTH_CLEAR;
         break;
      case GEN6_HIZ_OP_DEPTH_RESOLVE:
         dw1 |= GEN7_WM_DEPTH_RESOLVE;
         break;
      case GEN6_HIZ_OP_HIZ_RESOLVE:
         dw1 |= GEN7_WM_HIERARCHICAL_DEPTH_RESOLVE;
         break;
      default:
         assert(0);
         break;
      }

      BEGIN_BATCH(3);
      OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2));
      OUT_BATCH(dw1);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_PS
    *
    * Pixel shader dispatch is disabled above in 3DSTATE_WM, dw1.29.
    * Despite that, thread dispatch info must still be specified.
    *    - Maximum Number of Threads (dw4.24:31) must be nonzero, as the
    *      BSpec states that the valid range for this field is [0x3, 0x2f].
    *    - A dispatch mode must be given; that is, at least one of the
    *      "N Pixel Dispatch Enable" (N=8,16,32) fields must be set.  This
    *      was discovered through simulator error messages.
    */
   {
      BEGIN_BATCH(8);
      OUT_BATCH(_3DSTATE_PS << 16 | (8 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(((brw->max_wm_threads - 1) << IVB_PS_MAX_THREADS_SHIFT) |
                GEN7_PS_32_DISPATCH_ENABLE);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_DEPTH_BUFFER */
   {
      uint32_t width = mt->level[level].width;
      uint32_t height = mt->level[level].height;

      uint32_t tile_x;
      uint32_t tile_y;
      uint32_t offset;
      {
         /* Construct a dummy renderbuffer just to extract tile offsets. */
         struct intel_renderbuffer rb;
         rb.mt = mt;
         rb.mt_level = level;
         rb.mt_layer = layer;
         intel_renderbuffer_set_draw_offset(&rb);
         offset = intel_renderbuffer_tile_offsets(&rb, &tile_x, &tile_y);
      }

      intel_emit_depth_stall_flushes(intel);

      BEGIN_BATCH(7);
      OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2));
      OUT_BATCH(((mt->region->pitch * mt->region->cpp) - 1) |
                depth_format << 18 |
                1 << 22 | /* hiz enable */
                1 << 28 | /* depth write */
                BRW_SURFACE_2D << 29);
      OUT_RELOC(mt->region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                offset);
      OUT_BATCH((width + tile_x - 1) << 4 |
                (height + tile_y - 1) << 18);
      OUT_BATCH(0);
      OUT_BATCH(tile_x |
                tile_y << 16);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_HIER_DEPTH_BUFFER */
   {
      struct intel_region *hiz_region = mt->hiz_mt->region;

      BEGIN_BATCH(3);
      OUT_BATCH((GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2));
      OUT_BATCH(hiz_region->pitch * hiz_region->cpp - 1);
      OUT_RELOC(hiz_region->bo,
                I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_STENCIL_BUFFER */
   {
      BEGIN_BATCH(3);
      OUT_BATCH((GEN7_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_CLEAR_PARAMS
    *
    * From the BSpec, Volume 2a.11 Windower, Section 1.5.6.3.2
    * 3DSTATE_CLEAR_PARAMS:
    *    [DevIVB] 3DSTATE_CLEAR_PARAMS must always be programmed along with
    *    the other Depth/Stencil state commands (i.e. 3DSTATE_DEPTH_BUFFER,
    *    3DSTATE_STENCIL_BUFFER, or 3DSTATE_HIER_DEPTH_BUFFER).
    */
   {
      BEGIN_BATCH(3);
      OUT_BATCH(GEN7_3DSTATE_CLEAR_PARAMS << 16 | (3 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DSTATE_DRAWING_RECTANGLE */
   {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(((mt->level[level].width - 1) & 0xffff) |
                ((mt->level[level].height - 1) << 16));
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* 3DPRIMITIVE */
   {
      BEGIN_BATCH(7);
      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2));
      OUT_BATCH(GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL |
                _3DPRIM_RECTLIST);
      OUT_BATCH(3); /* vertex count per instance */
      OUT_BATCH(0);
      OUT_BATCH(1); /* instance count */
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   /* See comments above at first invocation of intel_flush() in
    * gen6_hiz_emit_batch_head().
    */
   intel_flush(ctx);

   /* Be safe. */
   brw->state.dirty.brw = ~0;
   brw->state.dirty.cache = ~0;
}
static void
upload_gs_state(struct brw_context *brw)
{
   /* BRW_NEW_GEOMETRY_PROGRAM */
   bool active = brw->geometry_program;
   /* BRW_NEW_GS_PROG_DATA */
   const struct brw_vue_prog_data *prog_data = &brw->gs.prog_data->base;
   const struct brw_stage_state *stage_state = &brw->gs.base;

   if (!active || stage_state->push_const_size == 0) {
      /* Disable the push constant buffers. */
      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_CONSTANT_GS << 16 | (5 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_CONSTANT_GS << 16 |
                GEN6_CONSTANT_BUFFER_0_ENABLE |
                (5 - 2));
      /* Pointer to the GS constant buffer.  Covered by the set of
       * state flags from gen6_upload_vs_constants
       */
      OUT_BATCH(stage_state->push_const_offset +
                stage_state->push_const_size - 1);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }

   if (active) {
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
      OUT_BATCH(stage_state->prog_offset);

      /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
       * was previously done for gen6.
       *
       * TODO: test with both disabled to see if the HW is behaving
       * as expected, like in gen7.
       */
      OUT_BATCH(GEN6_GS_SPF_MODE | GEN6_GS_VECTOR_MASK_ENABLE |
                ((ALIGN(stage_state->sampler_count, 4) / 4) <<
                 GEN6_GS_SAMPLER_COUNT_SHIFT) |
                ((prog_data->base.binding_table.size_bytes / 4) <<
                 GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));

      if (prog_data->base.total_scratch) {
         OUT_RELOC(stage_state->scratch_bo,
                   I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                   ffs(prog_data->base.total_scratch) - 11);
      } else {
         OUT_BATCH(0); /* no scratch space */
      }

      OUT_BATCH((prog_data->urb_read_length <<
                 GEN6_GS_URB_READ_LENGTH_SHIFT) |
                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) |
                (prog_data->base.dispatch_grf_start_reg <<
                 GEN6_GS_DISPATCH_START_GRF_SHIFT));

      OUT_BATCH(((brw->max_gs_threads - 1) << GEN6_GS_MAX_THREADS_SHIFT) |
                GEN6_GS_STATISTICS_ENABLE |
                GEN6_GS_SO_STATISTICS_ENABLE |
                GEN6_GS_RENDERING_ENABLE);

      if (brw->gs.prog_data->gen6_xfb_enabled) {
         /* GEN6_GS_REORDER is equivalent to GEN7_GS_REORDER_TRAILING
          * in gen7.  SNB and IVB specs are the same regarding the
          * reordering of TRISTRIP/TRISTRIP_REV vertices and triangle
          * orientation, so we do the same thing in both generations.
          * For more details, see the comment in gen7_gs_state.c
          */
         OUT_BATCH(GEN6_GS_REORDER |
                   GEN6_GS_SVBI_PAYLOAD_ENABLE |
                   GEN6_GS_ENABLE);
      } else {
         OUT_BATCH(GEN6_GS_REORDER | GEN6_GS_ENABLE);
      }
      ADVANCE_BATCH();
   } else if (brw->ff_gs.prog_active) {
      /* In gen6, transform feedback for the VS stage is done with an
       * ad-hoc GS program.  This function provides the needed 3DSTATE_GS
       * for this.
       */
      upload_gs_state_for_tf(brw);
   } else {
      /* No GS function required */
      BEGIN_BATCH(7);
      OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2));
      OUT_BATCH(0); /* prog_bo */
      OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) |
                (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT));
      OUT_BATCH(0); /* scratch space base offset */
      OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) |
                (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) |
                (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT));
      OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) |
                GEN6_GS_STATISTICS_ENABLE |
                GEN6_GS_RENDERING_ENABLE);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
   brw->gs.enabled = active;
}
/* Upload a new set of constants. Too much variability to go into the * cache mechanism, but maybe would benefit from a comparison against * the current uploaded set of constants. */ static void brw_upload_constant_buffer(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const GLuint sz = brw->curbe.total_size; const GLuint bufsz = sz * 16 * sizeof(GLfloat); GLfloat *buf; GLuint i; gl_clip_plane *clip_planes; if (sz == 0) { brw->curbe.last_bufsz = 0; goto emit; } buf = brw->curbe.next_buf; /* fragment shader constants */ if (brw->curbe.wm_size) { GLuint offset = brw->curbe.wm_start * 16; /* copy float constants */ for (i = 0; i < brw->wm.prog_data->nr_params; i++) { buf[offset + i] = *brw->wm.prog_data->param[i]; } } /* clipper constants */ if (brw->curbe.clip_size) { GLuint offset = brw->curbe.clip_start * 16; GLuint j; /* If any planes are going this way, send them all this way: */ for (i = 0; i < 6; i++) { buf[offset + i * 4 + 0] = fixed_plane[i][0]; buf[offset + i * 4 + 1] = fixed_plane[i][1]; buf[offset + i * 4 + 2] = fixed_plane[i][2]; buf[offset + i * 4 + 3] = fixed_plane[i][3]; } /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to * clip-space: */ clip_planes = brw_select_clip_planes(ctx); for (j = 0; j < MAX_CLIP_PLANES; j++) { if (ctx->Transform.ClipPlanesEnabled & (1<<j)) { buf[offset + i * 4 + 0] = clip_planes[j][0]; buf[offset + i * 4 + 1] = clip_planes[j][1]; buf[offset + i * 4 + 2] = clip_planes[j][2]; buf[offset + i * 4 + 3] = clip_planes[j][3]; i++; } } } /* vertex shader constants */ if (brw->curbe.vs_size) { GLuint offset = brw->curbe.vs_start * 16; for (i = 0; i < brw->vs.prog_data->base.nr_params; i++) { buf[offset + i] = *brw->vs.prog_data->base.param[i]; } } if (0) { for (i = 0; i < sz*16; i+=4) printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, buf[i+0], buf[i+1], buf[i+2], buf[i+3]); printf("last_buf %p buf %p sz %d/%d cmp %d\n", brw->curbe.last_buf, buf, bufsz, brw->curbe.last_bufsz, brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1); } if (brw->curbe.curbe_bo != NULL && bufsz == brw->curbe.last_bufsz && memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { /* constants have not changed */ } else { /* Update the record of what our last set of constants was. We * don't just flip the pointers because we don't fill in the * data in the padding between the entries. */ memcpy(brw->curbe.last_buf, buf, bufsz); brw->curbe.last_bufsz = bufsz; if (brw->curbe.curbe_bo != NULL && brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size) { drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo); drm_intel_bo_unreference(brw->curbe.curbe_bo); brw->curbe.curbe_bo = NULL; } if (brw->curbe.curbe_bo == NULL) { /* Allocate a single page for CURBE entries for this batchbuffer. * They're generally around 64b. */ brw->curbe.curbe_bo = drm_intel_bo_alloc(brw->bufmgr, "CURBE", 4096, 1 << 6); brw->curbe.curbe_next_offset = 0; drm_intel_gem_bo_map_gtt(brw->curbe.curbe_bo); assert(bufsz < 4096); } brw->curbe.curbe_offset = brw->curbe.curbe_next_offset; brw->curbe.curbe_next_offset += bufsz; brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64); /* Copy data to the buffer: */ memcpy(brw->curbe.curbe_bo->virtual + brw->curbe.curbe_offset, buf, bufsz); } /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the * previous time - because eg. the urb destination may have * changed, or the urb contents different to last time. 
* * Note that the data referred to is actually copied internally, * not just used in place according to passed pointer. * * It appears that the CS unit takes care of using each available * URB entry (Const URB Entry == CURBE) in turn, and issuing * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ emit: BEGIN_BATCH(2); if (brw->curbe.total_size == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); OUT_RELOC(brw->curbe.curbe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, (brw->curbe.total_size - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); }
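/* The allocation above sub-allocates each constants snapshot out of a
 * single 4 KB buffer object: bump curbe_next_offset by the size, keep
 * entries 64-byte aligned, and replace the BO once the page is full. A
 * stand-alone sketch of that bookkeeping; the names here are hypothetical
 * stand-ins for the brw->curbe fields, not driver code:
 */
#include <stdbool.h>
#include <stdint.h>

#define ALIGN(value, alignment) (((value) + (alignment) - 1) & ~((alignment) - 1))
#define CURBE_BO_SIZE 4096

struct curbe_alloc {
   uint32_t next_offset; /* first free byte in the current BO */
   bool has_bo;          /* whether a BO is currently allocated */
};

/* Reserve bufsz bytes and return the reservation's offset; *need_new_bo
 * tells the caller to drop the old BO and allocate a fresh page, mirroring
 * the unreference/alloc sequence in brw_upload_constant_buffer(). */
static uint32_t curbe_reserve(struct curbe_alloc *a, uint32_t bufsz,
                              bool *need_new_bo)
{
   *need_new_bo = !a->has_bo || a->next_offset + bufsz > CURBE_BO_SIZE;
   if (*need_new_bo) {
      a->next_offset = 0;
      a->has_bo = true;
   }
   const uint32_t offset = a->next_offset;
   a->next_offset = ALIGN(a->next_offset + bufsz, 64); /* 64-byte aligned */
   return offset;
}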
static void upload_sf_state(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; uint32_t dw1, dw2, dw3; float point_size; /* _NEW_BUFFERS */ bool render_to_fbo = _mesa_is_user_fbo(brw->intel.ctx.DrawBuffer); bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1; dw1 = GEN6_SF_STATISTICS_ENABLE | GEN6_SF_VIEWPORT_TRANSFORM_ENABLE; /* _NEW_BUFFERS */ dw1 |= (brw_depthbuffer_format(brw) << GEN7_SF_DEPTH_BUFFER_SURFACE_FORMAT_SHIFT); /* _NEW_POLYGON */ if ((ctx->Polygon.FrontFace == GL_CCW) ^ render_to_fbo) dw1 |= GEN6_SF_WINDING_CCW; if (ctx->Polygon.OffsetFill) dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_SOLID; if (ctx->Polygon.OffsetLine) dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_WIREFRAME; if (ctx->Polygon.OffsetPoint) dw1 |= GEN6_SF_GLOBAL_DEPTH_OFFSET_POINT; switch (ctx->Polygon.FrontMode) { case GL_FILL: dw1 |= GEN6_SF_FRONT_SOLID; break; case GL_LINE: dw1 |= GEN6_SF_FRONT_WIREFRAME; break; case GL_POINT: dw1 |= GEN6_SF_FRONT_POINT; break; default: assert(0); break; } switch (ctx->Polygon.BackMode) { case GL_FILL: dw1 |= GEN6_SF_BACK_SOLID; break; case GL_LINE: dw1 |= GEN6_SF_BACK_WIREFRAME; break; case GL_POINT: dw1 |= GEN6_SF_BACK_POINT; break; default: assert(0); break; } dw2 = 0; if (ctx->Polygon.CullFlag) { switch (ctx->Polygon.CullFaceMode) { case GL_FRONT: dw2 |= GEN6_SF_CULL_FRONT; break; case GL_BACK: dw2 |= GEN6_SF_CULL_BACK; break; case GL_FRONT_AND_BACK: dw2 |= GEN6_SF_CULL_BOTH; break; default: assert(0); break; } } else { dw2 |= GEN6_SF_CULL_NONE; } /* _NEW_SCISSOR */ if (ctx->Scissor.Enabled) dw2 |= GEN6_SF_SCISSOR_ENABLE; /* _NEW_LINE */ { uint32_t line_width_u3_7 = U_FIXED(CLAMP(ctx->Line.Width, 0.0, 7.99), 7); /* TODO: line width of 0 is not allowed when MSAA enabled */ if (line_width_u3_7 == 0) line_width_u3_7 = 1; dw2 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT; } if (ctx->Line.SmoothFlag) { dw2 |= GEN6_SF_LINE_AA_ENABLE; dw2 |= GEN6_SF_LINE_END_CAP_WIDTH_1_0; } if (ctx->Line.StippleFlag && intel->is_haswell) { dw2 |= HSW_SF_LINE_STIPPLE_ENABLE; } /* _NEW_MULTISAMPLE */ if (multisampled_fbo && ctx->Multisample.Enabled) dw2 |= GEN6_SF_MSRAST_ON_PATTERN; /* FINISHME: Last Pixel Enable? Vertex Sub Pixel Precision Select? */ dw3 = GEN6_SF_LINE_AA_MODE_TRUE; /* _NEW_PROGRAM | _NEW_POINT */ if (!(ctx->VertexProgram.PointSizeEnabled || ctx->Point._Attenuated)) dw3 |= GEN6_SF_USE_STATE_POINT_WIDTH; /* Clamp to ARB_point_parameters user limits */ point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); /* Clamp to the hardware limits and convert to fixed point */ dw3 |= U_FIXED(CLAMP(point_size, 0.125, 255.875), 3); /* _NEW_LIGHT */ if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) { dw3 |= (2 << GEN6_SF_TRI_PROVOKE_SHIFT) | (2 << GEN6_SF_TRIFAN_PROVOKE_SHIFT) | (1 << GEN6_SF_LINE_PROVOKE_SHIFT); } else { dw3 |= (1 << GEN6_SF_TRIFAN_PROVOKE_SHIFT); } BEGIN_BATCH(7); OUT_BATCH(_3DSTATE_SF << 16 | (7 - 2)); OUT_BATCH(dw1); OUT_BATCH(dw2); OUT_BATCH(dw3); OUT_BATCH_F(ctx->Polygon.OffsetUnits * 2); /* constant. copied from gen4 */ OUT_BATCH_F(ctx->Polygon.OffsetFactor); /* scale */ OUT_BATCH_F(0.0); /* XXX: global depth offset clamp */ ADVANCE_BATCH(); }
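/* The line width and point size above are converted with U_FIXED: an
 * unsigned fixed-point encoding that scales by 2^frac_bits and truncates
 * (u3.7 format for line width, u8.3 for point size). A minimal sketch,
 * assuming Mesa's U_FIXED and CLAMP definitions:
 */
#include <stdint.h>
#include <stdio.h>

#define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : ((x) > (hi) ? (hi) : (x)))
#define U_FIXED(value, frac_bits) ((uint32_t)((value) * (1 << (frac_bits))))

int main(void)
{
   /* u3.7: 1.5 * 128 = 192; the SF packet also bumps a width of 0 to 1. */
   printf("line width 1.5 -> u3.7 %u\n", U_FIXED(CLAMP(1.5f, 0.0f, 7.99f), 7));
   /* u8.3: 64.0 * 8 = 512, after clamping to [0.125, 255.875]. */
   printf("point size 64.0 -> u8.3 %u\n",
          U_FIXED(CLAMP(64.0f, 0.125f, 255.875f), 3));
   return 0;
}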
/* Copy BitBlt */ void intelEmitCopyBlit(struct intel_context *intel, GLuint cpp, GLshort src_pitch, dri_bo *src_buffer, GLuint src_offset, GLboolean src_tiled, GLshort dst_pitch, dri_bo *dst_buffer, GLuint dst_offset, GLboolean dst_tiled, GLshort src_x, GLshort src_y, GLshort dst_x, GLshort dst_y, GLshort w, GLshort h, GLenum logic_op) { GLuint CMD, BR13; int dst_y2 = dst_y + h; int dst_x2 = dst_x + w; int ret; BATCH_LOCALS; /* do space/cliprects check before going any further */ intel_batchbuffer_require_space(intel->batch, 8 * 4, NO_LOOP_CLIPRECTS); again: ret = dri_bufmgr_check_aperture_space(dst_buffer); ret |= dri_bufmgr_check_aperture_space(src_buffer); if (ret) { intel_batchbuffer_flush(intel->batch); goto again; } DBG("%s src:buf(%p)/%d+%d %d,%d dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n", __FUNCTION__, src_buffer, src_pitch, src_offset, src_x, src_y, dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h); src_pitch *= cpp; dst_pitch *= cpp; BR13 = translate_raster_op(logic_op) << 16; switch (cpp) { case 1: case 2: case 3: BR13 |= (1 << 24); CMD = XY_SRC_COPY_BLT_CMD; break; case 4: BR13 |= (1 << 24) | (1 << 25); CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; break; default: return; } #ifndef I915 if (dst_tiled) { CMD |= XY_DST_TILED; dst_pitch /= 4; } if (src_tiled) { CMD |= XY_SRC_TILED; src_pitch /= 4; } #endif if (dst_y2 <= dst_y || dst_x2 <= dst_x) { return; } /* Initial y values don't seem to work with negative pitches. If * we adjust the offsets manually (below), it seems to work fine. * * On the other hand, if we always adjust, the hardware doesn't * know which blit directions to use, so overlapping copypixels get * the wrong result. */ if (dst_pitch > 0 && src_pitch > 0) { assert(dst_x < dst_x2); assert(dst_y < dst_y2); BEGIN_BATCH(8, NO_LOOP_CLIPRECTS); OUT_BATCH(CMD); OUT_BATCH(BR13 | dst_pitch); OUT_BATCH((dst_y << 16) | dst_x); OUT_BATCH((dst_y2 << 16) | dst_x2); OUT_RELOC(dst_buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE, dst_offset); OUT_BATCH((src_y << 16) | src_x); OUT_BATCH(src_pitch); OUT_RELOC(src_buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, src_offset); ADVANCE_BATCH(); } else { assert(dst_x < dst_x2); assert(h > 0); BEGIN_BATCH(8, NO_LOOP_CLIPRECTS); OUT_BATCH(CMD); OUT_BATCH(BR13 | ((uint16_t)dst_pitch)); OUT_BATCH((0 << 16) | dst_x); OUT_BATCH((h << 16) | dst_x2); OUT_RELOC(dst_buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE, dst_offset + dst_y * dst_pitch); OUT_BATCH((0 << 16) | src_x); OUT_BATCH(src_pitch); OUT_RELOC(src_buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, src_offset + src_y * src_pitch); ADVANCE_BATCH(); } }
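/* The second emit path above works around negative-pitch (bottom-up)
 * surfaces: instead of programming a negative start Y, it rebases the
 * relocation offset at the rectangle's first row and blits rows 0..h-1,
 * truncating the pitch to BR13's 16-bit field. A sketch of just the
 * address math; the helper is hypothetical, not driver code:
 */
#include <stdint.h>

static uint32_t rebase_row_offset(uint32_t base_offset, int16_t y,
                                  int32_t pitch_bytes /* may be negative */)
{
   /* Byte offset of pixel row y; with a negative pitch this walks
    * backwards through memory, exactly like the
    * dst_offset + dst_y * dst_pitch expression above. */
   return base_offset + (uint32_t)((int32_t)y * pitch_bytes);
}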
static void upload_sbe_state(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; /* BRW_NEW_FRAGMENT_PROGRAM */ uint32_t num_outputs = _mesa_bitcount_64(brw->fragment_program->Base.InputsRead); /* _NEW_LIGHT */ bool shade_model_flat = ctx->Light.ShadeModel == GL_FLAT; uint32_t dw1, dw10, dw11; int i; int attr = 0, input_index = 0; int urb_entry_read_offset = 1; uint16_t attr_overrides[VARYING_SLOT_MAX]; /* _NEW_BUFFERS */ bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer); uint32_t point_sprite_origin; /* FINISHME: Attribute Swizzle Control Mode? */ dw1 = GEN7_SBE_SWIZZLE_ENABLE | num_outputs << GEN7_SBE_NUM_OUTPUTS_SHIFT; /* _NEW_POINT * * Window coordinates in an FBO are inverted, which means point * sprite origin must be inverted. */ if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) != render_to_fbo) { point_sprite_origin = GEN6_SF_POINT_SPRITE_LOWERLEFT; } else { point_sprite_origin = GEN6_SF_POINT_SPRITE_UPPERLEFT; } dw1 |= point_sprite_origin; dw10 = 0; dw11 = 0; /* Create the mapping from the FS inputs we produce to the VS outputs * they source from. */ uint32_t max_source_attr = 0; for (; attr < VARYING_SLOT_MAX; attr++) { enum glsl_interp_qualifier interp_qualifier = brw->fragment_program->InterpQualifier[attr]; bool is_gl_Color = attr == VARYING_SLOT_COL0 || attr == VARYING_SLOT_COL1; if (!(brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(attr))) continue; if (ctx->Point.PointSprite && attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7 && ctx->Point.CoordReplace[attr - VARYING_SLOT_TEX0]) { dw10 |= (1 << input_index); } if (attr == VARYING_SLOT_PNTC) dw10 |= (1 << input_index); /* flat shading */ if (interp_qualifier == INTERP_QUALIFIER_FLAT || (shade_model_flat && is_gl_Color && interp_qualifier == INTERP_QUALIFIER_NONE)) dw11 |= (1 << input_index); /* The hardware can only do the overrides on 16 overrides at a * time, and the other up to 16 have to be lined up so that the * input index = the output index. We'll need to do some * tweaking to make sure that's the case. */ assert(input_index < 16 || attr == input_index); /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */ attr_overrides[input_index++] = get_attr_override(&brw->vue_map_geom_out, urb_entry_read_offset, attr, ctx->VertexProgram._TwoSideEnabled, &max_source_attr); } /* From the Ivy Bridge PRM, Volume 2, Part 1, documentation for * 3DSTATE_SBE DWord 1 bits 15:11, "Vertex URB Entry Read Length": * * "This field should be set to the minimum length required to read the * maximum source attribute. The maximum source attribute is indicated * by the maximum value of the enabled Attribute # Source Attribute if * Attribute Swizzle Enable is set, Number of Output Attributes-1 if * enable is not set. * * read_length = ceiling((max_source_attr + 1) / 2)" */ uint32_t urb_entry_read_length = ALIGN(max_source_attr + 1, 2) / 2; dw1 |= urb_entry_read_length << GEN7_SBE_URB_ENTRY_READ_LENGTH_SHIFT | urb_entry_read_offset << GEN7_SBE_URB_ENTRY_READ_OFFSET_SHIFT; for (; input_index < VARYING_SLOT_MAX; input_index++) attr_overrides[input_index] = 0; BEGIN_BATCH(14); OUT_BATCH(_3DSTATE_SBE << 16 | (14 - 2)); OUT_BATCH(dw1); /* Output dwords 2 through 9 */ for (i = 0; i < 8; i++) { OUT_BATCH(attr_overrides[i * 2] | attr_overrides[i * 2 + 1] << 16); } OUT_BATCH(dw10); /* point sprite texcoord bitmask */ OUT_BATCH(dw11); /* constant interp bitmask */ OUT_BATCH(0); /* wrapshortest enables 0-7 */ OUT_BATCH(0); /* wrapshortest enables 8-15 */ ADVANCE_BATCH(); }
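/* The URB entry read length above is the PRM's ceiling formula,
 * read_length = ceiling((max_source_attr + 1) / 2); ALIGN(x, 2) / 2 and
 * (x + 1) / 2 agree for non-negative x. A tiny check, assuming Mesa's
 * ALIGN definition:
 */
#include <assert.h>
#include <stdint.h>

#define ALIGN(value, alignment) (((value) + (alignment) - 1) & ~((alignment) - 1))

static uint32_t urb_read_length(uint32_t max_source_attr)
{
   return ALIGN(max_source_attr + 1, 2) / 2;
}

static void urb_read_length_check(void)
{
   for (uint32_t a = 0; a < 64; a++)
      assert(urb_read_length(a) == (a + 2) / 2); /* ceil((a + 1) / 2) */
}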
/** * Copy the back color buffer to the front color buffer. * Used for SwapBuffers(). */ void intelCopyBuffer(const __DRIdrawablePrivate * dPriv, const drm_clip_rect_t * rect) { struct intel_context *intel; const intelScreenPrivate *intelScreen; int ret; DBG("%s\n", __FUNCTION__); assert(dPriv); intel = intelScreenContext(dPriv->driScreenPriv->private); if (!intel) return; intelScreen = intel->intelScreen; if (intel->last_swap_fence) { dri_fence_wait(intel->last_swap_fence); dri_fence_unreference(intel->last_swap_fence); intel->last_swap_fence = NULL; } intel->last_swap_fence = intel->first_swap_fence; intel->first_swap_fence = NULL; /* The LOCK_HARDWARE is required for the cliprects. Buffer offsets * should work regardless. */ LOCK_HARDWARE(intel); if (dPriv && dPriv->numClipRects) { struct intel_framebuffer *intel_fb = dPriv->driverPrivate; struct intel_region *src, *dst; int nbox = dPriv->numClipRects; drm_clip_rect_t *pbox = dPriv->pClipRects; int cpp; int src_pitch, dst_pitch; unsigned short src_x, src_y; int BR13, CMD; int i; src = intel_get_rb_region(&intel_fb->Base, BUFFER_BACK_LEFT); dst = intel_get_rb_region(&intel_fb->Base, BUFFER_FRONT_LEFT); src_pitch = src->pitch * src->cpp; dst_pitch = dst->pitch * dst->cpp; cpp = src->cpp; ASSERT(intel_fb); ASSERT(intel_fb->Base.Name == 0); /* Not a user-created FBO */ ASSERT(src); ASSERT(dst); ASSERT(src->cpp == dst->cpp); if (cpp == 2) { BR13 = (0xCC << 16) | (1 << 24); CMD = XY_SRC_COPY_BLT_CMD; } else { BR13 = (0xCC << 16) | (1 << 24) | (1 << 25); CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB; } #ifndef I915 if (src->tiled) { CMD |= XY_SRC_TILED; src_pitch /= 4; } if (dst->tiled) { CMD |= XY_DST_TILED; dst_pitch /= 4; } #endif /* do space/cliprects check before going any further */ intel_batchbuffer_require_space(intel->batch, 8 * 4, REFERENCES_CLIPRECTS); again: ret = dri_bufmgr_check_aperture_space(dst->buffer); ret |= dri_bufmgr_check_aperture_space(src->buffer); if (ret) { intel_batchbuffer_flush(intel->batch); goto again; } for (i = 0; i < nbox; i++, pbox++) { drm_clip_rect_t box = *pbox; if (rect) { if (!intel_intersect_cliprects(&box, &box, rect)) continue; } if (box.x1 >= box.x2 || box.y1 >= box.y2) continue; assert(box.x1 < box.x2); assert(box.y1 < box.y2); src_x = box.x1 - dPriv->x + dPriv->backX; src_y = box.y1 - dPriv->y + dPriv->backY; BEGIN_BATCH(8, REFERENCES_CLIPRECTS); OUT_BATCH(CMD); OUT_BATCH(BR13 | dst_pitch); OUT_BATCH((box.y1 << 16) | box.x1); OUT_BATCH((box.y2 << 16) | box.x2); OUT_RELOC(dst->buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_WRITE, 0); OUT_BATCH((src_y << 16) | src_x); OUT_BATCH(src_pitch); OUT_RELOC(src->buffer, DRM_BO_FLAG_MEM_TT | DRM_BO_FLAG_READ, 0); ADVANCE_BATCH(); } if (intel->first_swap_fence) dri_fence_unreference(intel->first_swap_fence); intel_batchbuffer_flush(intel->batch); intel->first_swap_fence = intel->batch->last_fence; if (intel->first_swap_fence) dri_fence_reference(intel->first_swap_fence); } UNLOCK_HARDWARE(intel); }
static void gen8_emit_vertices(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; bool uses_edge_flag; brw_prepare_vertices(brw); brw_prepare_shader_draw_parameters(brw); uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL || ctx->Polygon.BackMode != GL_FILL); if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) { unsigned vue = brw->vb.nr_enabled; /* The element for the edge flags must always be last, so we have to * insert the SGVS before it in that case. */ if (uses_edge_flag) { assert(vue > 0); vue--; } WARN_ONCE(vue >= 33, "Trying to insert VID/IID past 33rd vertex element, " "need to reorder the vertex attributes."); unsigned dw1 = 0; if (brw->vs.prog_data->uses_vertexid) { dw1 |= GEN8_SGVS_ENABLE_VERTEX_ID | (2 << GEN8_SGVS_VERTEX_ID_COMPONENT_SHIFT) | /* .z channel */ (vue << GEN8_SGVS_VERTEX_ID_ELEMENT_OFFSET_SHIFT); } if (brw->vs.prog_data->uses_instanceid) { dw1 |= GEN8_SGVS_ENABLE_INSTANCE_ID | (3 << GEN8_SGVS_INSTANCE_ID_COMPONENT_SHIFT) | /* .w channel */ (vue << GEN8_SGVS_INSTANCE_ID_ELEMENT_OFFSET_SHIFT); } BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2)); OUT_BATCH(dw1); ADVANCE_BATCH(); BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2)); OUT_BATCH(vue | GEN8_VF_INSTANCING_ENABLE); OUT_BATCH(0); ADVANCE_BATCH(); } else { BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_VF_SGVS << 16 | (2 - 2)); OUT_BATCH(0); ADVANCE_BATCH(); } /* If the VS doesn't read any inputs (calculating vertex position from * a state variable for some reason, for example), emit a single pad * VERTEX_ELEMENT struct and bail. * * The stale VB state stays in place, but the buffers don't do anything * unless a VE loads from them. */ if (brw->vb.nr_enabled == 0) { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (3 - 2)); OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | (0 << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT)); ADVANCE_BATCH(); return; } /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */ const bool uses_draw_params = brw->vs.prog_data->uses_basevertex || brw->vs.prog_data->uses_baseinstance; const unsigned nr_buffers = brw->vb.nr_buffers + uses_draw_params + brw->vs.prog_data->uses_drawid; if (nr_buffers) { assert(nr_buffers <= 33); BEGIN_BATCH(1 + 4 * nr_buffers); OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1)); for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->offset, buffer->offset + buffer->size, buffer->stride, 0 /* unused */); } if (uses_draw_params) { EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers, brw->draw.draw_params_bo, brw->draw.draw_params_offset, brw->draw.draw_params_bo->size, 0 /* stride */, 0 /* unused */); } if (brw->vs.prog_data->uses_drawid) { EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers + 1, brw->draw.draw_id_bo, brw->draw.draw_id_offset, brw->draw.draw_id_bo->size, 0 /* stride */, 0 /* unused */); } ADVANCE_BATCH(); } /* Normally we don't need an element for the SGVS attribute because the * 3DSTATE_VF_SGVS instruction lets you store the generated attribute in an * element that is past the list in 3DSTATE_VERTEX_ELEMENTS. 
However if * we're using draw parameters then we need an element for those * values. Additionally if there is an edge flag element then the SGVS * can't be inserted past that so we need a dummy element to ensure that * the edge flag is the last one. */ const bool needs_sgvs_element = (brw->vs.prog_data->uses_basevertex || brw->vs.prog_data->uses_baseinstance || ((brw->vs.prog_data->uses_instanceid || brw->vs.prog_data->uses_vertexid) && uses_edge_flag)); const unsigned nr_elements = brw->vb.nr_enabled + needs_sgvs_element + brw->vs.prog_data->uses_drawid; /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, * presumably for VertexID/InstanceID. */ assert(nr_elements <= 34); struct brw_vertex_element *gen6_edgeflag_input = NULL; BEGIN_BATCH(1 + nr_elements * 2); OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1)); for (unsigned i = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; uint32_t format = brw_get_vertex_surface_type(brw, input->glarray); uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC; /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE): * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an * element which has edge flag enabled." */ assert(!(is_passthru_format(format) && uses_edge_flag)); /* The gen4 driver expects edgeflag to come in as a float, and passes * that float on to the tests in the clipper. Mesa's current vertex * attribute value for EdgeFlag is stored as a float, which works out. * glEdgeFlagPointer, on the other hand, gives us an unnormalized * integer ubyte. Just rewrite that to convert to a float. */ if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) { /* Gen6+ passes edgeflag as sideband along with the vertex, instead * of in the VUE. We have to upload it sideband as the last vertex * element according to the B-Spec. */ gen6_edgeflag_input = input; continue; } switch (input->glarray->Size) { case 0: comp0 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */ case 1: comp1 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */ case 2: comp2 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */ case 3: comp3 = input->glarray->Integer ? BRW_VE1_COMPONENT_STORE_1_INT : BRW_VE1_COMPONENT_STORE_1_FLT; break; } /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE): * * "When SourceElementFormat is set to one of the *64*_PASSTHRU * formats, 64-bit components are stored in the URB without any * conversion. In this case, vertex elements must be written as 128 * or 256 bits, with VFCOMP_STORE_0 being used to pad the output * as required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red * component into the URB, Component 1 must be specified as * VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) * in order to output a 128-bit vertex element, or Components 1-3 must * be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex * element. Likewise, use of R64G64B64_PASSTHRU requires Component 3 * to be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex * element." */ if (input->glarray->Doubles) { switch (input->glarray->Size) { case 0: case 1: case 2: /* Use 128-bits instead of 256-bits to write double and dvec2 * vertex elements. */ comp2 = BRW_VE1_COMPONENT_NOSTORE; comp3 = BRW_VE1_COMPONENT_NOSTORE; break; case 3: /* Pad the output using VFCOMP_STORE_0 as suggested * by the BDW PRM. 
*/ comp3 = BRW_VE1_COMPONENT_STORE_0; break; } } OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (format << BRW_VE0_FORMAT_SHIFT) | (input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | (comp3 << BRW_VE1_COMPONENT_3_SHIFT)); } if (needs_sgvs_element) { if (brw->vs.prog_data->uses_basevertex || brw->vs.prog_data->uses_baseinstance) { OUT_BATCH(GEN6_VE0_VALID | brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT | BRW_SURFACEFORMAT_R32G32_UINT << BRW_VE0_FORMAT_SHIFT); OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } else { OUT_BATCH(GEN6_VE0_VALID); OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } } if (brw->vs.prog_data->uses_drawid) { OUT_BATCH(GEN6_VE0_VALID | ((brw->vb.nr_buffers + 1) << GEN6_VE0_INDEX_SHIFT) | (BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT)); OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } if (gen6_edgeflag_input) { uint32_t format = brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray); OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | GEN6_VE0_EDGE_FLAG_ENABLE | (format << BRW_VE0_FORMAT_SHIFT) | (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } ADVANCE_BATCH(); for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) { const struct brw_vertex_element *input = brw->vb.enabled[i]; const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer]; unsigned element_index; /* The edge flag element is reordered to be the last one in the code * above so we need to compensate for that in the element indices used * below. */ if (input == gen6_edgeflag_input) element_index = nr_elements - 1; else element_index = j++; BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2)); OUT_BATCH(element_index | (buffer->step_rate ? GEN8_VF_INSTANCING_ENABLE : 0)); OUT_BATCH(buffer->step_rate); ADVANCE_BATCH(); } if (brw->vs.prog_data->uses_drawid) { const unsigned element = brw->vb.nr_enabled + needs_sgvs_element; BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_VF_INSTANCING << 16 | (3 - 2)); OUT_BATCH(element); OUT_BATCH(0); ADVANCE_BATCH(); } }
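/* The trailing 3DSTATE_VF_INSTANCING loop above has to undo the earlier
 * reordering: the edge flag element was held back and emitted last, so it
 * gets element index nr_elements - 1 while every other input keeps its
 * emission order (the j++ counter). A stand-alone sketch of that
 * remapping rule:
 */
#include <assert.h>

static unsigned element_index_for(unsigned i, unsigned edgeflag_i,
                                  unsigned nr_elements)
{
   unsigned j = 0;
   for (unsigned k = 0; k <= i; k++) {
      if (k == edgeflag_i)
         continue;          /* the edge flag takes the last slot instead */
      if (k == i)
         return j;
      j++;
   }
   return nr_elements - 1;  /* i was the edge flag input */
}

static void element_index_check(void)
{
   /* Four inputs with the edge flag at index 1: emitted as 0, 2, 3, then 1. */
   assert(element_index_for(0, 1, 4) == 0);
   assert(element_index_for(2, 1, 4) == 1);
   assert(element_index_for(3, 1, 4) == 2);
   assert(element_index_for(1, 1, 4) == 3);
}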
static void upload_wm_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_fragment_program *fp = brw_fragment_program_const(brw->fragment_program); uint32_t dw2, dw4, dw5, dw6; /* _NEW_BUFFERS */ bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1; /* CACHE_NEW_WM_PROG */ if (brw->wm.prog_data->nr_params == 0) { /* Disable the push constant buffers. */ BEGIN_BATCH(5); OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } else { BEGIN_BATCH(5); OUT_BATCH(_3DSTATE_CONSTANT_PS << 16 | GEN6_CONSTANT_BUFFER_0_ENABLE | (5 - 2)); /* Pointer to the WM constant buffer. Covered by the set of * state flags from gen6_upload_wm_push_constants. */ OUT_BATCH(brw->wm.push_const_offset + ALIGN(brw->wm.prog_data->nr_params, brw->wm.prog_data->dispatch_width) / 8 - 1); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } dw2 = dw4 = dw5 = dw6 = 0; dw4 |= GEN6_WM_STATISTICS_ENABLE; dw5 |= GEN6_WM_LINE_AA_WIDTH_1_0; dw5 |= GEN6_WM_LINE_END_CAP_AA_WIDTH_0_5; /* Use ALT floating point mode for ARB fragment programs, because they * require 0^0 == 1. Even though _CurrentFragmentProgram is used for * rendering, CurrentFragmentProgram is used for this check to * differentiate between the GLSL and non-GLSL cases. */ if (ctx->Shader.CurrentFragmentProgram == NULL) dw2 |= GEN6_WM_FLOATING_POINT_MODE_ALT; /* CACHE_NEW_SAMPLER */ dw2 |= (ALIGN(brw->sampler.count, 4) / 4) << GEN6_WM_SAMPLER_COUNT_SHIFT; dw4 |= (brw->wm.prog_data->first_curbe_grf << GEN6_WM_DISPATCH_START_GRF_SHIFT_0); dw4 |= (brw->wm.prog_data->first_curbe_grf_16 << GEN6_WM_DISPATCH_START_GRF_SHIFT_2); dw5 |= (brw->max_wm_threads - 1) << GEN6_WM_MAX_THREADS_SHIFT; /* CACHE_NEW_WM_PROG */ dw5 |= GEN6_WM_8_DISPATCH_ENABLE; if (brw->wm.prog_data->prog_offset_16) dw5 |= GEN6_WM_16_DISPATCH_ENABLE; /* CACHE_NEW_WM_PROG | _NEW_COLOR */ if (brw->wm.prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) && ctx->Color.Blend[0]._UsesDualSrc) { dw5 |= GEN6_WM_DUAL_SOURCE_BLEND_ENABLE; } /* _NEW_LINE */ if (ctx->Line.StippleFlag) dw5 |= GEN6_WM_LINE_STIPPLE_ENABLE; /* _NEW_POLYGON */ if (ctx->Polygon.StippleFlag) dw5 |= GEN6_WM_POLYGON_STIPPLE_ENABLE; /* BRW_NEW_FRAGMENT_PROGRAM */ if (fp->program.Base.InputsRead & VARYING_BIT_POS) dw5 |= GEN6_WM_USES_SOURCE_DEPTH | GEN6_WM_USES_SOURCE_W; if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) dw5 |= GEN6_WM_COMPUTED_DEPTH; /* CACHE_NEW_WM_PROG */ dw6 |= brw->wm.prog_data->barycentric_interp_modes << GEN6_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT; /* _NEW_COLOR, _NEW_MULTISAMPLE */ if (fp->program.UsesKill || ctx->Color.AlphaEnabled || ctx->Multisample.SampleAlphaToCoverage) dw5 |= GEN6_WM_KILL_ENABLE; if (brw_color_buffer_write_enabled(brw) || dw5 & (GEN6_WM_KILL_ENABLE | GEN6_WM_COMPUTED_DEPTH)) { dw5 |= GEN6_WM_DISPATCH_ENABLE; } dw6 |= _mesa_bitcount_64(brw->fragment_program->Base.InputsRead) << GEN6_WM_NUM_SF_OUTPUTS_SHIFT; if (multisampled_fbo) { /* _NEW_MULTISAMPLE */ if (ctx->Multisample.Enabled) dw6 |= GEN6_WM_MSRAST_ON_PATTERN; else dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; dw6 |= GEN6_WM_MSDISPMODE_PERPIXEL; } else { dw6 |= GEN6_WM_MSRAST_OFF_PIXEL; dw6 |= GEN6_WM_MSDISPMODE_PERSAMPLE; } BEGIN_BATCH(9); OUT_BATCH(_3DSTATE_WM << 16 | (9 - 2)); OUT_BATCH(brw->wm.prog_offset); OUT_BATCH(dw2); if (brw->wm.prog_data->total_scratch) { OUT_RELOC(brw->wm.scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(brw->wm.prog_data->total_scratch) - 11); } else { OUT_BATCH(0); } 
OUT_BATCH(dw4); OUT_BATCH(dw5); OUT_BATCH(dw6); OUT_BATCH(0); /* kernel 1 pointer */ /* kernel 2 pointer */ OUT_BATCH(brw->wm.prog_offset + brw->wm.prog_data->prog_offset_16); ADVANCE_BATCH(); }
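/* The ffs(total_scratch) - 11 expression used for every scratch-space
 * relocation in these functions encodes a power-of-two scratch size as
 * log2(bytes / 1024): ffs(1024) == 11, so 1 KB -> 0, 2 KB -> 1, 4 KB -> 2,
 * and so on. This sketch assumes total_scratch was already rounded up to a
 * power of two of at least 1 KB, which the driver's scratch allocation
 * guarantees:
 */
#include <assert.h>
#include <strings.h> /* ffs() */

static int scratch_space_field(unsigned total_scratch)
{
   return ffs((int)total_scratch) - 11;
}

static void scratch_space_field_check(void)
{
   assert(scratch_space_field(1024) == 0);
   assert(scratch_space_field(2048) == 1);
   assert(scratch_space_field(4096) == 2);
}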
static void gen8_upload_gs_state(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; const struct brw_stage_state *stage_state = &brw->gs.base; /* BRW_NEW_GEOMETRY_PROGRAM */ bool active = brw->geometry_program; /* CACHE_NEW_GS_PROG */ const struct brw_vec4_prog_data *prog_data = &brw->gs.prog_data->base; if (active) { int urb_entry_write_offset = 1; uint32_t urb_entry_output_length = ((prog_data->vue_map.num_slots + 1) / 2 - urb_entry_write_offset); if (urb_entry_output_length == 0) urb_entry_output_length = 1; BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(0); OUT_BATCH(GEN6_GS_VECTOR_MASK_ENABLE | brw->geometry_program->VerticesIn | ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->base.binding_table.size_bytes / 4) << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (brw->gs.prog_data->base.total_scratch) { OUT_RELOC64(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(brw->gs.prog_data->base.total_scratch) - 11); WARN_ONCE(true, "May need to implement a temporary workaround: GS Number of " "URB Entries must be less than or equal to the GS Maximum " "Number of Threads.\n"); } else { OUT_BATCH(0); OUT_BATCH(0); } /* DW6 */ OUT_BATCH(((brw->gs.prog_data->output_vertex_size_hwords * 2 - 1) << GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) | (brw->gs.prog_data->output_topology << GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) | (prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) | (prog_data->dispatch_grf_start_reg << GEN6_GS_DISPATCH_START_GRF_SHIFT)); /* DW7 */ OUT_BATCH(((brw->max_gs_threads / 2 - 1) << HSW_GS_MAX_THREADS_SHIFT) | (brw->gs.prog_data->control_data_header_size_hwords << GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) | (brw->gs.prog_data->dual_instanced_dispatch ? GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE : GEN7_GS_DISPATCH_MODE_DUAL_OBJECT) | GEN6_GS_STATISTICS_ENABLE | (brw->gs.prog_data->include_primitive_id ? GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) | GEN7_GS_REORDER_TRAILING | GEN7_GS_ENABLE); /* DW8 */ OUT_BATCH(brw->gs.prog_data->control_data_format << HSW_GS_CONTROL_DATA_FORMAT_SHIFT); /* DW9 / _NEW_TRANSFORM */ OUT_BATCH((ctx->Transform.ClipPlanesEnabled << GEN8_GS_USER_CLIP_DISTANCE_SHIFT) | (urb_entry_output_length << GEN8_GS_URB_OUTPUT_LENGTH_SHIFT) | (urb_entry_write_offset << GEN8_GS_URB_ENTRY_OUTPUT_OFFSET_SHIFT)); ADVANCE_BATCH(); } else { BEGIN_BATCH(10); OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2)); OUT_BATCH(0); /* prog_bo */ OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); /* scratch space base offset */ OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(GEN6_GS_STATISTICS_ENABLE); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } }
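/* The urb_entry_output_length computed above counts how many 32-byte
 * (two-slot) units of the VUE the GS writes past the write offset; the
 * hardware field cannot encode zero, hence the clamp to 1. A sketch,
 * assuming vue_map.num_slots counts 16-byte slots as in the code above:
 */
#include <stdint.h>

static uint32_t gs_urb_output_length(uint32_t num_slots,
                                     uint32_t write_offset)
{
   uint32_t len = (num_slots + 1) / 2 - write_offset;
   return len ? len : 1; /* 0 is not encodable */
}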
static void gen7_blorp_emit_depth_stencil_config(struct brw_context *brw, const brw_blorp_params *params) { const uint8_t mocs = GEN7_MOCS_L3; uint32_t surfwidth, surfheight; uint32_t surftype; unsigned int depth = MAX2(params->depth.mt->logical_depth0, 1); unsigned int min_array_element; GLenum gl_target = params->depth.mt->target; unsigned int lod; switch (gl_target) { case GL_TEXTURE_CUBE_MAP_ARRAY: case GL_TEXTURE_CUBE_MAP: /* The PRM claims that we should use BRW_SURFACE_CUBE for this * situation, but experiments show that gl_Layer doesn't work when we do * this. So we use BRW_SURFACE_2D, since for rendering purposes this is * equivalent. */ surftype = BRW_SURFACE_2D; depth *= 6; break; default: surftype = translate_tex_target(gl_target); break; } min_array_element = params->depth.layer; if (params->depth.mt->num_samples > 1) { /* Convert physical layer to logical layer. */ min_array_element /= params->depth.mt->num_samples; } lod = params->depth.level - params->depth.mt->first_level; if (params->hiz_op != GEN6_HIZ_OP_NONE && lod == 0) { /* HIZ ops for lod 0 may set the width & height a little * larger to allow the fast depth clear to fit the hardware * alignment requirements. (8x4) */ surfwidth = params->depth.width; surfheight = params->depth.height; } else { surfwidth = params->depth.mt->logical_width0; surfheight = params->depth.mt->logical_height0; } /* 3DSTATE_DEPTH_BUFFER */ { brw_emit_depth_stall_flushes(brw); BEGIN_BATCH(7); OUT_BATCH(GEN7_3DSTATE_DEPTH_BUFFER << 16 | (7 - 2)); OUT_BATCH((params->depth.mt->pitch - 1) | params->depth_format << 18 | 1 << 22 | /* hiz enable */ 1 << 28 | /* depth write */ surftype << 29); OUT_RELOC(params->depth.mt->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); OUT_BATCH((surfwidth - 1) << 4 | (surfheight - 1) << 18 | lod); OUT_BATCH(((depth - 1) << 21) | (min_array_element << 10) | mocs); OUT_BATCH(0); OUT_BATCH((depth - 1) << 21); ADVANCE_BATCH(); } /* 3DSTATE_HIER_DEPTH_BUFFER */ { struct intel_miptree_aux_buffer *hiz_buf = params->depth.mt->hiz_buf; BEGIN_BATCH(3); OUT_BATCH((GEN7_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2)); OUT_BATCH((mocs << 25) | (hiz_buf->pitch - 1)); OUT_RELOC(hiz_buf->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, 0); ADVANCE_BATCH(); } /* 3DSTATE_STENCIL_BUFFER */ { BEGIN_BATCH(3); OUT_BATCH((GEN7_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2)); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } }
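/* Two small conversions above are easy to miss: cube (array) surfaces are
 * rendered as 2D arrays with six faces per logical layer, and for
 * multisampled miptrees the starting layer is converted from physical to
 * logical by dividing by the sample count (assuming, as the code above
 * implies, that each sample occupies its own physical array slice). A
 * sketch of both rules:
 */
static unsigned cube_render_depth(unsigned logical_depth)
{
   return logical_depth * 6; /* six 2D faces per cube layer */
}

static unsigned logical_min_array_element(unsigned physical_layer,
                                          unsigned num_samples)
{
   return num_samples > 1 ? physical_layer / num_samples : physical_layer;
}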
/** * Define the base addresses which some state is referenced from. * * This allows us to avoid having to emit relocations for the objects, * and is actually required for binding table pointers on gen6. * * Surface state base address covers binding table pointers and * surface state objects, but not the surfaces that the surface state * objects point to. */ static void upload_state_base_address( struct brw_context *brw ) { /* FINISHME: According to section 3.6.1 "STATE_BASE_ADDRESS" of * vol1a of the G45 PRM, MI_FLUSH with the ISC invalidate should be * programmed prior to STATE_BASE_ADDRESS. * * However, given that the instruction SBA (general state base * address) on this chipset is always set to 0 across X and GL, * maybe this isn't required for us in particular. */ if (brw->gen >= 6) { if (brw->gen == 6) intel_emit_post_sync_nonzero_flush(brw); BEGIN_BATCH(10); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (10 - 2)); /* General state base address: stateless DP read/write requests */ OUT_BATCH(1); /* Surface state base address: * BINDING_TABLE_STATE * SURFACE_STATE */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Dynamic state base address: * SAMPLER_STATE * SAMPLER_BORDER_COLOR_STATE * CLIP, SF, WM/CC viewport state * COLOR_CALC_STATE * DEPTH_STENCIL_STATE * BLEND_STATE * Push constants (when INSTPM: CONSTANT_BUFFER Address Offset * Disable is clear, which we rely on) */ OUT_RELOC(brw->batch.bo, (I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION), 0, 1); OUT_BATCH(1); /* Indirect object base address: MEDIA_OBJECT data */ OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); /* Instruction base address: shader kernels (incl. SIP) */ OUT_BATCH(1); /* General state upper bound */ /* Dynamic state upper bound. Although the documentation says that * programming it to zero will cause it to be ignored, that is a lie. * If this isn't programmed to a real bound, the sampler border color * pointer is rejected, causing border color to mysteriously fail. */ OUT_BATCH(0xfffff001); OUT_BATCH(1); /* Indirect object upper bound */ OUT_BATCH(1); /* Instruction access upper bound */ ADVANCE_BATCH(); } else if (brw->gen == 5) { BEGIN_BATCH(8); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2)); OUT_BATCH(1); /* General state base address */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Indirect object base address */ OUT_RELOC(brw->cache.bo, I915_GEM_DOMAIN_INSTRUCTION, 0, 1); /* Instruction base address */ OUT_BATCH(0xfffff001); /* General state upper bound */ OUT_BATCH(1); /* Indirect object upper bound */ OUT_BATCH(1); /* Instruction access upper bound */ ADVANCE_BATCH(); } else { BEGIN_BATCH(6); OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2)); OUT_BATCH(1); /* General state base address */ OUT_RELOC(brw->batch.bo, I915_GEM_DOMAIN_SAMPLER, 0, 1); /* Surface state base address */ OUT_BATCH(1); /* Indirect object base address */ OUT_BATCH(1); /* General state upper bound */ OUT_BATCH(1); /* Indirect object upper bound */ ADVANCE_BATCH(); } /* According to section 3.6.1 of VOL1 of the 965 PRM, * STATE_BASE_ADDRESS updates require a reissue of: * * 3DSTATE_PIPELINE_POINTERS * 3DSTATE_BINDING_TABLE_POINTERS * MEDIA_STATE_POINTERS * * and this continues through Ironlake. 
The Sandy Bridge PRM, vol * 1 part 1 says that the following packets must be reissued: * * 3DSTATE_CC_POINTERS * 3DSTATE_BINDING_TABLE_POINTERS * 3DSTATE_SAMPLER_STATE_POINTERS * 3DSTATE_VIEWPORT_STATE_POINTERS * MEDIA_STATE_POINTERS * * Those are always reissued following SBA updates anyway (new * batch time), except in the case of the program cache BO * changing. Having a separate state flag makes the sequence more * obvious. */ brw->state.dirty.brw |= BRW_NEW_STATE_BASE_ADDRESS; }
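/* Each address dword in STATE_BASE_ADDRESS carries a "Modify Enable" flag
 * in bit 0, which the base addresses can spare because they are 4 KB
 * aligned. That is why the unused bases above are emitted as a literal 1
 * and why the relocations use a delta of 1. A sketch of the packing:
 */
#include <assert.h>
#include <stdint.h>

#define BASE_ADDRESS_MODIFY (1u << 0)

static uint32_t pack_base_address(uint32_t base)
{
   assert((base & 0xfffu) == 0);      /* bases are 4 KB aligned */
   return base | BASE_ADDRESS_MODIFY; /* a base of 0 packs to the literal 1 */
}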
static void upload_wm_state(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; const struct brw_fragment_program *fp = brw_fragment_program_const(brw->fragment_program); bool writes_depth = false; uint32_t dw1, dw2; /* _NEW_BUFFERS */ bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1; dw1 = dw2 = 0; dw1 |= GEN7_WM_STATISTICS_ENABLE; dw1 |= GEN7_WM_LINE_AA_WIDTH_1_0; dw1 |= GEN7_WM_LINE_END_CAP_AA_WIDTH_0_5; /* _NEW_LINE */ if (ctx->Line.StippleFlag) dw1 |= GEN7_WM_LINE_STIPPLE_ENABLE; /* _NEW_POLYGON */ if (ctx->Polygon.StippleFlag) dw1 |= GEN7_WM_POLYGON_STIPPLE_ENABLE; /* BRW_NEW_FRAGMENT_PROGRAM */ if (fp->program.Base.InputsRead & VARYING_BIT_POS) dw1 |= GEN7_WM_USES_SOURCE_DEPTH | GEN7_WM_USES_SOURCE_W; if (fp->program.Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { writes_depth = true; dw1 |= GEN7_WM_PSCDEPTH_ON; } /* CACHE_NEW_WM_PROG */ dw1 |= brw->wm.prog_data->barycentric_interp_modes << GEN7_WM_BARYCENTRIC_INTERPOLATION_MODE_SHIFT; /* _NEW_COLOR, _NEW_MULTISAMPLE */ if (fp->program.UsesKill || ctx->Color.AlphaEnabled || ctx->Multisample.SampleAlphaToCoverage) dw1 |= GEN7_WM_KILL_ENABLE; /* _NEW_BUFFERS */ if (brw_color_buffer_write_enabled(brw) || writes_depth || dw1 & GEN7_WM_KILL_ENABLE) { dw1 |= GEN7_WM_DISPATCH_ENABLE; } if (multisampled_fbo) { /* _NEW_MULTISAMPLE */ if (ctx->Multisample.Enabled) dw1 |= GEN7_WM_MSRAST_ON_PATTERN; else dw1 |= GEN7_WM_MSRAST_OFF_PIXEL; dw2 |= GEN7_WM_MSDISPMODE_PERPIXEL; } else { dw1 |= GEN7_WM_MSRAST_OFF_PIXEL; dw2 |= GEN7_WM_MSDISPMODE_PERSAMPLE; } BEGIN_BATCH(3); OUT_BATCH(_3DSTATE_WM << 16 | (3 - 2)); OUT_BATCH(dw1); OUT_BATCH(dw2); ADVANCE_BATCH(); }
void brw_emit_depth_stencil_hiz(struct brw_context *brw, struct intel_mipmap_tree *depth_mt, uint32_t depth_offset, uint32_t depthbuffer_format, uint32_t depth_surface_type, struct intel_mipmap_tree *stencil_mt, bool hiz, bool separate_stencil, uint32_t width, uint32_t height, uint32_t tile_x, uint32_t tile_y) { /* Enable the hiz bit if we're doing separate stencil, because it and the * separate stencil bit must have the same value. From Section 2.11.5.6.1.1 * 3DSTATE_DEPTH_BUFFER, Bit 1.21 "Separate Stencil Enable": * [DevIL]: If this field is enabled, Hierarchical Depth Buffer * Enable must also be enabled. * * [DevGT]: This field must be set to the same value (enabled or * disabled) as Hierarchical Depth Buffer Enable */ bool enable_hiz_ss = hiz || separate_stencil; /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both * non-pipelined state that will need the PIPE_CONTROL workaround. */ if (brw->gen == 6) { intel_emit_post_sync_nonzero_flush(brw); intel_emit_depth_stall_flushes(brw); } unsigned int len; if (brw->gen >= 6) len = 7; else if (brw->is_g4x || brw->gen == 5) len = 6; else len = 5; BEGIN_BATCH(len); OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH((depth_mt ? depth_mt->region->pitch - 1 : 0) | (depthbuffer_format << 18) | ((enable_hiz_ss ? 1 : 0) << 21) | /* separate stencil enable */ ((enable_hiz_ss ? 1 : 0) << 22) | /* hiz enable */ (BRW_TILEWALK_YMAJOR << 26) | ((depth_mt ? depth_mt->region->tiling != I915_TILING_NONE : 1) << 27) | (depth_surface_type << 29)); if (depth_mt) { OUT_RELOC(depth_mt->region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, depth_offset); } else { OUT_BATCH(0); } OUT_BATCH(((width + tile_x - 1) << 6) | ((height + tile_y - 1) << 19)); OUT_BATCH(0); if (brw->is_g4x || brw->gen >= 5) OUT_BATCH(tile_x | (tile_y << 16)); else assert(tile_x == 0 && tile_y == 0); if (brw->gen >= 6) OUT_BATCH(0); ADVANCE_BATCH(); if (hiz || separate_stencil) { /* * In the 3DSTATE_DEPTH_BUFFER batch emitted above, the 'separate * stencil enable' and 'hiz enable' bits were set. Therefore we must * emit 3DSTATE_HIER_DEPTH_BUFFER and 3DSTATE_STENCIL_BUFFER. Even if * there is no stencil buffer, 3DSTATE_STENCIL_BUFFER must be emitted; * failure to do so causes hangs on gen5 and a stall on gen6. */ /* Emit hiz buffer. */ if (hiz) { struct intel_mipmap_tree *hiz_mt = depth_mt->hiz_mt; BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2)); OUT_BATCH(hiz_mt->region->pitch - 1); OUT_RELOC(hiz_mt->region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, brw->depthstencil.hiz_offset); ADVANCE_BATCH(); } else { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2)); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } /* Emit stencil buffer. */ if (separate_stencil) { struct intel_region *region = stencil_mt->region; BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2)); /* The stencil buffer has quirky pitch requirements. From Vol 2a, * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch": * The pitch must be set to 2x the value computed based on width, as * the stencil buffer is stored with two rows interleaved. */ OUT_BATCH(2 * region->pitch - 1); OUT_RELOC(region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, brw->depthstencil.stencil_offset); ADVANCE_BATCH(); } else { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2)); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } } /* * On Gen >= 6, emit clear params for safety. If using hiz, then clear * params must be emitted. 
* * From Section 2.11.5.6.4.1 3DSTATE_CLEAR_PARAMS: * 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE packet * when HiZ is enabled and the DEPTH_BUFFER_STATE changes. */ if (brw->gen >= 6 || hiz) { if (brw->gen == 6) intel_emit_post_sync_nonzero_flush(brw); BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | GEN5_DEPTH_CLEAR_VALID | (2 - 2)); OUT_BATCH(depth_mt ? depth_mt->depth_clear_value : 0); ADVANCE_BATCH(); } }
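/* The pitch fields in these depth/stencil packets are all "pitch minus
 * one" encodings, and the separate stencil buffer programs twice the
 * computed pitch because its rows are stored two-interleaved (see the PRM
 * quote above). A sketch of the two encodings:
 */
#include <stdint.h>

static uint32_t depth_pitch_field(uint32_t pitch_bytes)
{
   return pitch_bytes - 1;
}

static uint32_t stencil_pitch_field(uint32_t pitch_bytes)
{
   return 2 * pitch_bytes - 1; /* two interleaved rows per programmed row */
}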
static void emit_depthbuffer(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; struct gl_framebuffer *fb = ctx->DrawBuffer; /* _NEW_BUFFERS */ struct intel_renderbuffer *depth_irb = intel_get_renderbuffer(fb, BUFFER_DEPTH); struct intel_renderbuffer *stencil_irb = intel_get_renderbuffer(fb, BUFFER_STENCIL); struct intel_mipmap_tree *depth_mt = brw->depthstencil.depth_mt; struct intel_mipmap_tree *stencil_mt = brw->depthstencil.stencil_mt; struct intel_mipmap_tree *hiz_mt = brw->depthstencil.hiz_mt; uint32_t tile_x = brw->depthstencil.tile_x; uint32_t tile_y = brw->depthstencil.tile_y; unsigned int len; bool separate_stencil = false; if (stencil_mt && stencil_mt->format == MESA_FORMAT_S8) separate_stencil = true; /* 3DSTATE_DEPTH_BUFFER, 3DSTATE_STENCIL_BUFFER are both * non-pipelined state that will need the PIPE_CONTROL workaround. */ if (intel->gen == 6) { intel_emit_post_sync_nonzero_flush(intel); intel_emit_depth_stall_flushes(intel); } /* If there's a packed depth/stencil bound to stencil only, we need to * emit the packed depth/stencil buffer packet. */ if (!depth_irb && stencil_irb && !separate_stencil) { depth_irb = stencil_irb; depth_mt = stencil_mt; } if (intel->gen >= 6) len = 7; else if (intel->is_g4x || intel->gen == 5) len = 6; else len = 5; if (!depth_irb && !separate_stencil) { BEGIN_BATCH(len); OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | (BRW_SURFACE_NULL << 29)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); if (intel->is_g4x || intel->gen >= 5) OUT_BATCH(0); if (intel->gen >= 6) OUT_BATCH(0); ADVANCE_BATCH(); } else if (!depth_irb && separate_stencil) { /* * There exists a separate stencil buffer but no depth buffer. * * The stencil buffer inherits most of its fields from * 3DSTATE_DEPTH_BUFFER: namely the tile walk, surface type, width, and * height. * * Enable the hiz bit because it and the separate stencil bit must have * the same value. From Section 2.11.5.6.1.1 3DSTATE_DEPTH_BUFFER, Bit * 1.21 "Separate Stencil Enable": * [DevIL]: If this field is enabled, Hierarchical Depth Buffer * Enable must also be enabled. * * [DevGT]: This field must be set to the same value (enabled or * disabled) as Hierarchical Depth Buffer Enable * * The tiled bit must be set. From the Sandybridge PRM, Volume 2, Part 1, * Section 7.5.5.1.1 3DSTATE_DEPTH_BUFFER, Bit 1.27 Tiled Surface: * [DevGT+]: This field must be set to TRUE. */ assert(intel->has_separate_stencil); BEGIN_BATCH(len); OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH((BRW_DEPTHFORMAT_D32_FLOAT << 18) | (1 << 21) | /* separate stencil enable */ (1 << 22) | /* hiz enable */ (BRW_TILEWALK_YMAJOR << 26) | (1 << 27) | /* tiled surface */ (BRW_SURFACE_2D << 29)); OUT_BATCH(0); OUT_BATCH(((stencil_irb->Base.Base.Width + tile_x - 1) << 6) | (stencil_irb->Base.Base.Height + tile_y - 1) << 19); OUT_BATCH(0); if (intel->is_g4x || intel->gen >= 5) OUT_BATCH(tile_x | (tile_y << 16)); else assert(tile_x == 0 && tile_y == 0); if (intel->gen >= 6) OUT_BATCH(0); ADVANCE_BATCH(); } else { struct intel_region *region = depth_mt->region; /* If using separate stencil, hiz must be enabled. */ assert(!separate_stencil || hiz_mt); assert(intel->gen < 6 || region->tiling == I915_TILING_Y); assert(!hiz_mt || region->tiling == I915_TILING_Y); BEGIN_BATCH(len); OUT_BATCH(_3DSTATE_DEPTH_BUFFER << 16 | (len - 2)); OUT_BATCH((region->pitch - 1) | (brw_depthbuffer_format(brw) << 18) | ((hiz_mt ? 
1 : 0) << 21) | /* separate stencil enable */ ((hiz_mt ? 1 : 0) << 22) | /* hiz enable */ (BRW_TILEWALK_YMAJOR << 26) | ((region->tiling != I915_TILING_NONE) << 27) | (BRW_SURFACE_2D << 29)); OUT_RELOC(region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, brw->depthstencil.depth_offset); OUT_BATCH((BRW_SURFACE_MIPMAPLAYOUT_BELOW << 1) | (((depth_irb->Base.Base.Width + tile_x) - 1) << 6) | (((depth_irb->Base.Base.Height + tile_y) - 1) << 19)); OUT_BATCH(0); if (intel->is_g4x || intel->gen >= 5) OUT_BATCH(tile_x | (tile_y << 16)); else assert(tile_x == 0 && tile_y == 0); if (intel->gen >= 6) OUT_BATCH(0); ADVANCE_BATCH(); } if (hiz_mt || separate_stencil) { /* * In the 3DSTATE_DEPTH_BUFFER batch emitted above, the 'separate * stencil enable' and 'hiz enable' bits were set. Therefore we must * emit 3DSTATE_HIER_DEPTH_BUFFER and 3DSTATE_STENCIL_BUFFER. Even if * there is no stencil buffer, 3DSTATE_STENCIL_BUFFER must be emitted; * failure to do so causes hangs on gen5 and a stall on gen6. */ /* Emit hiz buffer. */ if (hiz_mt) { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2)); OUT_BATCH(hiz_mt->region->pitch - 1); OUT_RELOC(hiz_mt->region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, brw->depthstencil.hiz_offset); ADVANCE_BATCH(); } else { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_HIER_DEPTH_BUFFER << 16) | (3 - 2)); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } /* Emit stencil buffer. */ if (separate_stencil) { struct intel_region *region = stencil_mt->region; BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2)); /* The stencil buffer has quirky pitch requirements. From Vol 2a, * 11.5.6.2.1 3DSTATE_STENCIL_BUFFER, field "Surface Pitch": * The pitch must be set to 2x the value computed based on width, as * the stencil buffer is stored with two rows interleaved. */ OUT_BATCH(2 * region->pitch - 1); OUT_RELOC(region->bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, brw->depthstencil.stencil_offset); ADVANCE_BATCH(); } else { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_STENCIL_BUFFER << 16) | (3 - 2)); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } } /* * On Gen >= 6, emit clear params for safety. If using hiz, then clear * params must be emitted. * * From Section 2.11.5.6.4.1 3DSTATE_CLEAR_PARAMS: * 3DSTATE_CLEAR_PARAMS packet must follow the DEPTH_BUFFER_STATE packet * when HiZ is enabled and the DEPTH_BUFFER_STATE changes. */ if (intel->gen >= 6 || hiz_mt) { if (intel->gen == 6) intel_emit_post_sync_nonzero_flush(intel); BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_CLEAR_PARAMS << 16 | GEN5_DEPTH_CLEAR_VALID | (2 - 2)); OUT_BATCH(depth_irb ? depth_irb->mt->depth_clear_value : 0); ADVANCE_BATCH(); } }
static void brw_emit_vertices(struct brw_context *brw) { GLuint i; brw_prepare_vertices(brw); brw_prepare_shader_draw_parameters(brw); brw_emit_query_begin(brw); unsigned nr_elements = brw->vb.nr_enabled; if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) ++nr_elements; /* If the VS doesn't read any inputs (calculating vertex position from * a state variable for some reason, for example), emit a single pad * VERTEX_ELEMENT struct and bail. * * The stale VB state stays in place, but the buffers don't do anything * unless a VE loads from them. */ if (nr_elements == 0) { BEGIN_BATCH(3); OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | 1); if (brw->gen >= 6) { OUT_BATCH((0 << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | (0 << BRW_VE0_SRC_OFFSET_SHIFT)); } else { OUT_BATCH((0 << BRW_VE0_INDEX_SHIFT) | BRW_VE0_VALID | (BRW_SURFACEFORMAT_R32G32B32A32_FLOAT << BRW_VE0_FORMAT_SHIFT) | (0 << BRW_VE0_SRC_OFFSET_SHIFT)); } OUT_BATCH((BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_1_FLT << BRW_VE1_COMPONENT_3_SHIFT)); ADVANCE_BATCH(); return; } /* Now emit VB and VEP state packets. */ unsigned nr_buffers = brw->vb.nr_buffers + brw->vs.prog_data->uses_vertexid; if (nr_buffers) { if (brw->gen >= 6) { assert(nr_buffers <= 33); } else { assert(nr_buffers <= 17); } BEGIN_BATCH(1 + 4 * nr_buffers); OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1)); for (i = 0; i < brw->vb.nr_buffers; i++) { struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1, buffer->offset, buffer->stride, buffer->step_rate); } if (brw->vs.prog_data->uses_vertexid) { EMIT_VERTEX_BUFFER_STATE(brw, brw->vb.nr_buffers, brw->draw.draw_params_bo, brw->draw.draw_params_bo->size - 1, brw->draw.draw_params_offset, 0, /* stride */ 0); /* step rate */ } ADVANCE_BATCH(); } /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, presumably * for VertexID/InstanceID. */ if (brw->gen >= 6) { assert(nr_elements <= 34); } else { assert(nr_elements <= 18); } struct brw_vertex_element *gen6_edgeflag_input = NULL; BEGIN_BATCH(1 + nr_elements * 2); OUT_BATCH((_3DSTATE_VERTEX_ELEMENTS << 16) | (2 * nr_elements - 1)); for (i = 0; i < brw->vb.nr_enabled; i++) { struct brw_vertex_element *input = brw->vb.enabled[i]; uint32_t format = brw_get_vertex_surface_type(brw, input->glarray); uint32_t comp0 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp1 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp2 = BRW_VE1_COMPONENT_STORE_SRC; uint32_t comp3 = BRW_VE1_COMPONENT_STORE_SRC; if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) { /* Gen6+ passes edgeflag as sideband along with the vertex, instead * of in the VUE. We have to upload it sideband as the last vertex * element according to the B-Spec. */ if (brw->gen >= 6) { gen6_edgeflag_input = input; continue; } } switch (input->glarray->Size) { case 0: comp0 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */ case 1: comp1 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */ case 2: comp2 = BRW_VE1_COMPONENT_STORE_0; /* fallthrough */ case 3: comp3 = input->glarray->Integer ? 
BRW_VE1_COMPONENT_STORE_1_INT : BRW_VE1_COMPONENT_STORE_1_FLT; break; } if (brw->gen >= 6) { OUT_BATCH((input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | (format << BRW_VE0_FORMAT_SHIFT) | (input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); } else { OUT_BATCH((input->buffer << BRW_VE0_INDEX_SHIFT) | BRW_VE0_VALID | (format << BRW_VE0_FORMAT_SHIFT) | (input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); } if (brw->gen >= 5) OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | (comp3 << BRW_VE1_COMPONENT_3_SHIFT)); else OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) | (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | (comp3 << BRW_VE1_COMPONENT_3_SHIFT) | ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT)); } if (brw->gen >= 6 && gen6_edgeflag_input) { uint32_t format = brw_get_vertex_surface_type(brw, gen6_edgeflag_input->glarray); OUT_BATCH((gen6_edgeflag_input->buffer << GEN6_VE0_INDEX_SHIFT) | GEN6_VE0_VALID | GEN6_VE0_EDGE_FLAG_ENABLE | (format << BRW_VE0_FORMAT_SHIFT) | (gen6_edgeflag_input->offset << BRW_VE0_SRC_OFFSET_SHIFT)); OUT_BATCH((BRW_VE1_COMPONENT_STORE_SRC << BRW_VE1_COMPONENT_0_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_1_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_2_SHIFT) | (BRW_VE1_COMPONENT_STORE_0 << BRW_VE1_COMPONENT_3_SHIFT)); } if (brw->vs.prog_data->uses_vertexid || brw->vs.prog_data->uses_instanceid) { uint32_t dw0 = 0, dw1 = 0; uint32_t comp0 = BRW_VE1_COMPONENT_STORE_0; uint32_t comp1 = BRW_VE1_COMPONENT_STORE_0; uint32_t comp2 = BRW_VE1_COMPONENT_STORE_0; uint32_t comp3 = BRW_VE1_COMPONENT_STORE_0; if (brw->vs.prog_data->uses_vertexid) { comp0 = BRW_VE1_COMPONENT_STORE_SRC; comp2 = BRW_VE1_COMPONENT_STORE_VID; } if (brw->vs.prog_data->uses_instanceid) { comp3 = BRW_VE1_COMPONENT_STORE_IID; } dw1 = (comp0 << BRW_VE1_COMPONENT_0_SHIFT) | (comp1 << BRW_VE1_COMPONENT_1_SHIFT) | (comp2 << BRW_VE1_COMPONENT_2_SHIFT) | (comp3 << BRW_VE1_COMPONENT_3_SHIFT); if (brw->gen >= 6) { dw0 |= GEN6_VE0_VALID | brw->vb.nr_buffers << GEN6_VE0_INDEX_SHIFT | BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT; } else { dw0 |= BRW_VE0_VALID | brw->vb.nr_buffers << BRW_VE0_INDEX_SHIFT | BRW_SURFACEFORMAT_R32_UINT << BRW_VE0_FORMAT_SHIFT; dw1 |= (i * 4) << BRW_VE1_DST_OFFSET_SHIFT; } /* Note that for gl_VertexID, gl_InstanceID, and gl_PrimitiveID values, * the format is ignored and the value is always int. */ OUT_BATCH(dw0); OUT_BATCH(dw1); } ADVANCE_BATCH(); }
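/* The fall-through switch on input->glarray->Size in both emit loops
 * implements one rule: components the vertex array does not supply read
 * back as 0, except the fourth, which reads back as 1 (integer or float
 * depending on the array type). A self-contained sketch of that rule,
 * with stand-in values for the BRW_VE1_COMPONENT_STORE_* encodings:
 */
enum comp_store { STORE_SRC, STORE_0, STORE_1_FLT, STORE_1_INT };

static void default_components(int size, int is_integer,
                               enum comp_store comp[4])
{
   for (int c = 0; c < 4; c++)
      comp[c] = STORE_SRC;   /* supplied components pass through */
   for (int c = size; c < 3; c++)
      comp[c] = STORE_0;     /* missing x/y/z default to 0 */
   if (size < 4)
      comp[3] = is_integer ? STORE_1_INT : STORE_1_FLT; /* w defaults to 1 */
}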