static inline void brw_upload_programs(struct brw_context *brw, enum brw_pipeline pipeline) { struct gl_context *ctx = &brw->ctx; const struct gen_device_info *devinfo = &brw->screen->devinfo; if (pipeline == BRW_RENDER_PIPELINE) { brw_upload_vs_prog(brw); brw_upload_tess_programs(brw); if (brw->programs[MESA_SHADER_GEOMETRY]) { brw_upload_gs_prog(brw); } else { brw->gs.base.prog_data = NULL; if (devinfo->gen < 7) brw_upload_ff_gs_prog(brw); } /* Update the VUE map for data exiting the GS stage of the pipeline. * This comes from the last enabled shader stage. */ GLbitfield64 old_slots = brw->vue_map_geom_out.slots_valid; bool old_separate = brw->vue_map_geom_out.separate; struct brw_vue_prog_data *vue_prog_data; if (brw->programs[MESA_SHADER_GEOMETRY]) vue_prog_data = brw_vue_prog_data(brw->gs.base.prog_data); else if (brw->programs[MESA_SHADER_TESS_EVAL]) vue_prog_data = brw_vue_prog_data(brw->tes.base.prog_data); else vue_prog_data = brw_vue_prog_data(brw->vs.base.prog_data); brw->vue_map_geom_out = vue_prog_data->vue_map; /* If the layout has changed, signal BRW_NEW_VUE_MAP_GEOM_OUT. */ if (old_slots != brw->vue_map_geom_out.slots_valid || old_separate != brw->vue_map_geom_out.separate) brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT; if ((old_slots ^ brw->vue_map_geom_out.slots_valid) & VARYING_BIT_VIEWPORT) { ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT; brw->clip.viewport_count = (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ? ctx->Const.MaxViewports : 1; } brw_upload_wm_prog(brw); if (devinfo->gen < 6) { brw_upload_clip_prog(brw); brw_upload_sf_prog(brw); } brw_disk_cache_write_render_programs(brw); } else if (pipeline == BRW_COMPUTE_PIPELINE) { brw_upload_cs_prog(brw); brw_disk_cache_write_compute_program(brw); } }
/**
 * Work out the VS and GS URB entry sizes from the current program data and
 * hand them to gen6_upload_urb().
 */
static void
upload_urb(struct brw_context *brw)
{
   /* BRW_NEW_GEOMETRY_PROGRAM */
   const bool user_gs = brw->geometry_program;

   /* BRW_NEW_VS_PROG_DATA */
   const struct brw_vue_prog_data *vs_data =
      brw_vue_prog_data(brw->vs.base.prog_data);
   /* Never report a VS entry size below 1. */
   const unsigned vs_entry_size = MAX2(vs_data->urb_entry_size, 1);

   /* When using the GS to do transform feedback only, we use the same VUE
    * layout for VS outputs and GS outputs (as it's what the SF and Clipper
    * expect), so we can simply make the GS URB entry size the same as for
    * the VS.  This may technically be too large in cases where we have few
    * vertex attributes and a lot of varyings, since the VS size is
    * determined by the larger of the two.  For now, it's safe.
    *
    * For a user-provided GS the assumption above does not hold, since the
    * GS outputs can be different from the VS outputs; take the size from
    * the GS program data instead.
    */
   unsigned gs_entry_size;
   if (user_gs) {
      /* BRW_NEW_GS_PROG_DATA */
      const struct brw_vue_prog_data *gs_data =
         brw_vue_prog_data(brw->gs.base.prog_data);

      gs_entry_size = gs_data->urb_entry_size;
      assert(gs_entry_size >= 1);
   } else {
      gs_entry_size = vs_entry_size;
   }

   /* The GS counts as present for either the fixed-function GS program or a
    * user geometry program.
    */
   gen6_upload_urb(brw, vs_entry_size,
                   brw->ff_gs.prog_active || user_gs,
                   gs_entry_size);
}
static inline void brw_upload_programs(struct brw_context *brw, enum brw_pipeline pipeline) { struct gl_context *ctx = &brw->ctx; if (pipeline == BRW_RENDER_PIPELINE) { brw_upload_vs_prog(brw); brw_upload_tess_programs(brw); if (brw->gen < 6) brw_upload_ff_gs_prog(brw); else brw_upload_gs_prog(brw); /* Update the VUE map for data exiting the GS stage of the pipeline. * This comes from the last enabled shader stage. */ GLbitfield64 old_slots = brw->vue_map_geom_out.slots_valid; bool old_separate = brw->vue_map_geom_out.separate; struct brw_vue_prog_data *vue_prog_data; if (brw->geometry_program) vue_prog_data = brw_vue_prog_data(brw->gs.base.prog_data); else if (brw->tess_eval_program) vue_prog_data = brw_vue_prog_data(brw->tes.base.prog_data); else vue_prog_data = brw_vue_prog_data(brw->vs.base.prog_data); brw->vue_map_geom_out = vue_prog_data->vue_map; /* If the layout has changed, signal BRW_NEW_VUE_MAP_GEOM_OUT. */ if (old_slots != brw->vue_map_geom_out.slots_valid || old_separate != brw->vue_map_geom_out.separate) brw->ctx.NewDriverState |= BRW_NEW_VUE_MAP_GEOM_OUT; if ((old_slots ^ brw->vue_map_geom_out.slots_valid) & VARYING_BIT_VIEWPORT) { ctx->NewDriverState |= BRW_NEW_VIEWPORT_COUNT; brw->clip.viewport_count = (brw->vue_map_geom_out.slots_valid & VARYING_BIT_VIEWPORT) ? ctx->Const.MaxViewports : 1; } brw_upload_wm_prog(brw); if (brw->gen < 6) { brw_upload_clip_prog(brw); brw_upload_sf_prog(brw); } } else if (pipeline == BRW_COMPUTE_PIPELINE) { brw_upload_cs_prog(brw); } }
static void gen7_upload_ds_state(struct brw_context *brw) { const struct gen_device_info *devinfo = &brw->screen->devinfo; const struct brw_stage_state *stage_state = &brw->tes.base; /* BRW_NEW_TESS_PROGRAMS */ bool active = brw->tess_eval_program; /* BRW_NEW_TES_PROG_DATA */ const struct brw_stage_prog_data *prog_data = stage_state->prog_data; const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(stage_state->prog_data); const struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(stage_state->prog_data); const unsigned thread_count = (devinfo->max_tes_threads - 1) << (brw->is_haswell ? HSW_DS_MAX_THREADS_SHIFT : GEN7_DS_MAX_THREADS_SHIFT); if (active) { BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(SET_FIELD(DIV_ROUND_UP(stage_state->sampler_count, 4), GEN7_DS_SAMPLER_COUNT) | SET_FIELD(prog_data->binding_table.size_bytes / 4, GEN7_DS_BINDING_TABLE_ENTRY_COUNT)); if (prog_data->total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } OUT_BATCH(SET_FIELD(prog_data->dispatch_grf_start_reg, GEN7_DS_DISPATCH_START_GRF) | SET_FIELD(vue_prog_data->urb_read_length, GEN7_DS_URB_READ_LENGTH)); OUT_BATCH(GEN7_DS_ENABLE | GEN7_DS_STATISTICS_ENABLE | thread_count | (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ? GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0)); ADVANCE_BATCH(); } else { BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_DS << 16 | (6 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } brw->tes.enabled = active; }
static void upload_gs_state(struct brw_context *brw) { const struct gen_device_info *devinfo = &brw->screen->devinfo; const struct brw_stage_state *stage_state = &brw->gs.base; const int max_threads_shift = brw->is_haswell ? HSW_GS_MAX_THREADS_SHIFT : GEN6_GS_MAX_THREADS_SHIFT; /* BRW_NEW_GEOMETRY_PROGRAM */ bool active = brw->geometry_program; /* BRW_NEW_GS_PROG_DATA */ const struct brw_stage_prog_data *prog_data = stage_state->prog_data; const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(stage_state->prog_data); const struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(stage_state->prog_data); /** * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > * Geometry > Geometry Shader > State: * * "Note: Because of corruption in IVB:GT2, software needs to flush the * whole fixed function pipeline when the GS enable changes value in * the 3DSTATE_GS." * * The hardware architects have clarified that in this context "flush the * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS * Stall" bit set. 
*/ if (!brw->is_haswell && brw->gt == 2 && brw->gs.enabled != active) gen7_emit_cs_stall_flush(brw); if (active) { BEGIN_BATCH(7); OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_GS_SAMPLER_COUNT_SHIFT) | ((prog_data->binding_table.size_bytes / 4) << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } uint32_t dw4 = ((gs_prog_data->output_vertex_size_hwords * 2 - 1) << GEN7_GS_OUTPUT_VERTEX_SIZE_SHIFT) | (gs_prog_data->output_topology << GEN7_GS_OUTPUT_TOPOLOGY_SHIFT) | (vue_prog_data->urb_read_length << GEN6_GS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT) | (prog_data->dispatch_grf_start_reg << GEN6_GS_DISPATCH_START_GRF_SHIFT); /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between * Ivy Bridge and Haswell. * * On Ivy Bridge, setting this bit causes the vertices of a triangle * strip to be delivered to the geometry shader in an order that does * not strictly follow the OpenGL spec, but preserves triangle * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then * the geometry shader sees triangles: * * (1, 2, 3), (2, 4, 3), (3, 4, 5) * * (Clearing the bit is even worse, because it fails to preserve * orientation). * * Triangle strips with adjacency always ordered in a way that preserves * triangle orientation but does not strictly follow the OpenGL spec, * regardless of the setting of this bit. * * On Haswell, both triangle strips and triangle strips with adjacency * are always ordered in a way that preserves triangle orientation. * Setting this bit causes the ordering to strictly follow the OpenGL * spec. * * So in either case we want to set the bit. Unfortunately on Ivy * Bridge this will get the order close to correct but not perfect. 
*/ uint32_t dw5 = ((devinfo->max_gs_threads - 1) << max_threads_shift) | (gs_prog_data->control_data_header_size_hwords << GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) | ((gs_prog_data->invocations - 1) << GEN7_GS_INSTANCE_CONTROL_SHIFT) | SET_FIELD(vue_prog_data->dispatch_mode, GEN7_GS_DISPATCH_MODE) | GEN6_GS_STATISTICS_ENABLE | (gs_prog_data->include_primitive_id ? GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) | GEN7_GS_REORDER_TRAILING | GEN7_GS_ENABLE; uint32_t dw6 = 0; if (brw->is_haswell) { dw6 |= gs_prog_data->control_data_format << HSW_GS_CONTROL_DATA_FORMAT_SHIFT; } else { dw5 |= gs_prog_data->control_data_format << IVB_GS_CONTROL_DATA_FORMAT_SHIFT; } OUT_BATCH(dw4); OUT_BATCH(dw5); OUT_BATCH(dw6); ADVANCE_BATCH(); } else { BEGIN_BATCH(7); OUT_BATCH(_3DSTATE_GS << 16 | (7 - 2)); OUT_BATCH(0); /* prog_bo */ OUT_BATCH((0 << GEN6_GS_SAMPLER_COUNT_SHIFT) | (0 << GEN6_GS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); OUT_BATCH(0); /* scratch space base offset */ OUT_BATCH((1 << GEN6_GS_DISPATCH_START_GRF_SHIFT) | (0 << GEN6_GS_URB_READ_LENGTH_SHIFT) | GEN7_GS_INCLUDE_VERTEX_HANDLES | (0 << GEN6_GS_URB_ENTRY_READ_OFFSET_SHIFT)); OUT_BATCH((0 << GEN6_GS_MAX_THREADS_SHIFT) | GEN6_GS_STATISTICS_ENABLE); OUT_BATCH(0); ADVANCE_BATCH(); } brw->gs.enabled = active; }
void brw_codegen_ff_gs_prog(struct brw_context *brw, struct brw_ff_gs_prog_key *key) { const struct gen_device_info *devinfo = &brw->screen->devinfo; struct brw_ff_gs_compile c; const GLuint *program; void *mem_ctx; GLuint program_size; memset(&c, 0, sizeof(c)); c.key = *key; c.vue_map = brw_vue_prog_data(brw->vs.base.prog_data)->vue_map; c.nr_regs = (c.vue_map.num_slots + 1)/2; mem_ctx = ralloc_context(NULL); /* Begin the compilation: */ brw_init_codegen(&brw->screen->devinfo, &c.func, mem_ctx); c.func.single_program_flow = 1; /* For some reason the thread is spawned with only 4 channels * unmasked. */ brw_set_default_mask_control(&c.func, BRW_MASK_DISABLE); if (devinfo->gen >= 6) { unsigned num_verts; bool check_edge_flag; /* On Sandybridge, we use the GS for implementing transform feedback * (called "Stream Out" in the PRM). */ switch (key->primitive) { case _3DPRIM_POINTLIST: num_verts = 1; check_edge_flag = false; break; case _3DPRIM_LINELIST: case _3DPRIM_LINESTRIP: case _3DPRIM_LINELOOP: num_verts = 2; check_edge_flag = false; break; case _3DPRIM_TRILIST: case _3DPRIM_TRIFAN: case _3DPRIM_TRISTRIP: case _3DPRIM_RECTLIST: num_verts = 3; check_edge_flag = false; break; case _3DPRIM_QUADLIST: case _3DPRIM_QUADSTRIP: case _3DPRIM_POLYGON: num_verts = 3; check_edge_flag = true; break; default: unreachable("Unexpected primitive type in Gen6 SOL program."); } gen6_sol_program(&c, key, num_verts, check_edge_flag); } else { /* On Gen4-5, we use the GS to decompose certain types of primitives. * Note that primitives which don't require a GS program have already * been weeded out by now. 
*/ switch (key->primitive) { case _3DPRIM_QUADLIST: brw_ff_gs_quads( &c, key ); break; case _3DPRIM_QUADSTRIP: brw_ff_gs_quad_strip( &c, key ); break; case _3DPRIM_LINELOOP: brw_ff_gs_lines( &c ); break; default: ralloc_free(mem_ctx); return; } } brw_compact_instructions(&c.func, 0, NULL); /* get the program */ program = brw_get_program(&c.func, &program_size); if (unlikely(INTEL_DEBUG & DEBUG_GS)) { fprintf(stderr, "gs:\n"); brw_disassemble(&brw->screen->devinfo, c.func.store, 0, program_size, stderr); fprintf(stderr, "\n"); } brw_upload_cache(&brw->cache, BRW_CACHE_FF_GS_PROG, &c.key, sizeof(c.key), program, program_size, &c.prog_data, sizeof(c.prog_data), &brw->ff_gs.prog_offset, &brw->ff_gs.prog_data); ralloc_free(mem_ctx); }
static void brw_ff_gs_populate_key(struct brw_context *brw, struct brw_ff_gs_prog_key *key) { const struct gen_device_info *devinfo = &brw->screen->devinfo; static const unsigned swizzle_for_offset[4] = { BRW_SWIZZLE4(0, 1, 2, 3), BRW_SWIZZLE4(1, 2, 3, 3), BRW_SWIZZLE4(2, 3, 3, 3), BRW_SWIZZLE4(3, 3, 3, 3) }; struct gl_context *ctx = &brw->ctx; assert(devinfo->gen < 7); memset(key, 0, sizeof(*key)); /* BRW_NEW_VS_PROG_DATA (part of VUE map) */ key->attrs = brw_vue_prog_data(brw->vs.base.prog_data)->vue_map.slots_valid; /* BRW_NEW_PRIMITIVE */ key->primitive = brw->primitive; /* _NEW_LIGHT */ key->pv_first = (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION); if (key->primitive == _3DPRIM_QUADLIST && ctx->Light.ShadeModel != GL_FLAT) { /* Provide consistent primitive order with brw_set_prim's * optimization of single quads to trifans. */ key->pv_first = true; } if (devinfo->gen == 6) { /* On Gen6, GS is used for transform feedback. */ /* BRW_NEW_TRANSFORM_FEEDBACK */ if (_mesa_is_xfb_active_and_unpaused(ctx)) { const struct gl_program *prog = ctx->_Shader->CurrentProgram[MESA_SHADER_VERTEX]; const struct gl_transform_feedback_info *linked_xfb_info = prog->sh.LinkedTransformFeedback; int i; /* Make sure that the VUE slots won't overflow the unsigned chars in * key->transform_feedback_bindings[]. */ STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256); /* Make sure that we don't need more binding table entries than we've * set aside for use in transform feedback. (We shouldn't, since we * set aside enough binding table entries to have one per component). 
*/ assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS); key->need_gs_prog = true; key->num_transform_feedback_bindings = linked_xfb_info->NumOutputs; for (i = 0; i < key->num_transform_feedback_bindings; ++i) { key->transform_feedback_bindings[i] = linked_xfb_info->Outputs[i].OutputRegister; key->transform_feedback_swizzles[i] = swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset]; } } } else { /* Pre-gen6, GS is used to transform QUADLIST, QUADSTRIP, and LINELOOP * into simpler primitives. */ key->need_gs_prog = (brw->primitive == _3DPRIM_QUADLIST || brw->primitive == _3DPRIM_QUADSTRIP || brw->primitive == _3DPRIM_LINELOOP); } }
static void upload_vs_state(struct brw_context *brw) { const struct gen_device_info *devinfo = &brw->screen->devinfo; const struct brw_stage_state *stage_state = &brw->vs.base; const struct brw_stage_prog_data *prog_data = stage_state->prog_data; const struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(stage_state->prog_data); uint32_t floating_point_mode = 0; /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State, * 3DSTATE_VS, Dword 5.0 "VS Function Enable": * * [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS * command that causes the VS Function Enable to toggle. Pipeline * flush can be executed by sending a PIPE_CONTROL command with CS * stall bit set and a post sync operation. * * We've already done such a flush at the start of state upload, so we * don't need to do another one here. */ if (stage_state->push_const_size == 0) { /* Disable the push constant buffers. */ BEGIN_BATCH(5); OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | (5 - 2)); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } else { BEGIN_BATCH(5); OUT_BATCH(_3DSTATE_CONSTANT_VS << 16 | GEN6_CONSTANT_BUFFER_0_ENABLE | (5 - 2)); /* Pointer to the VS constant buffer. 
Covered by the set of * state flags from gen6_upload_vs_constants */ OUT_BATCH(stage_state->push_const_offset + stage_state->push_const_size - 1); OUT_BATCH(0); OUT_BATCH(0); OUT_BATCH(0); ADVANCE_BATCH(); } if (prog_data->use_alt_mode) floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT; BEGIN_BATCH(6); OUT_BATCH(_3DSTATE_VS << 16 | (6 - 2)); OUT_BATCH(stage_state->prog_offset); OUT_BATCH(floating_point_mode | ((ALIGN(stage_state->sampler_count, 4)/4) << GEN6_VS_SAMPLER_COUNT_SHIFT) | ((prog_data->binding_table.size_bytes / 4) << GEN6_VS_BINDING_TABLE_ENTRY_COUNT_SHIFT)); if (prog_data->total_scratch) { OUT_RELOC(stage_state->scratch_bo, I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER, ffs(stage_state->per_thread_scratch) - 11); } else { OUT_BATCH(0); } OUT_BATCH((prog_data->dispatch_grf_start_reg << GEN6_VS_DISPATCH_START_GRF_SHIFT) | (vue_prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) | (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT)); OUT_BATCH(((devinfo->max_vs_threads - 1) << GEN6_VS_MAX_THREADS_SHIFT) | GEN6_VS_STATISTICS_ENABLE | GEN6_VS_ENABLE); ADVANCE_BATCH(); /* Based on my reading of the simulator, the VS constants don't get * pulled into the VS FF unit until an appropriate pipeline flush * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds * references to them into a little FIFO. The flushes are common, * but don't reliably happen between this and a 3DPRIMITIVE, causing * the primitive to use the wrong constants. Then the FIFO * containing the constant setup gets added to again on the next * constants change, and eventually when a flush does happen the * unit is overwhelmed by constant changes and dies. * * To avoid this, send a PIPE_CONTROL down the line that will * update the unit immediately loading the constants. 
The flush * type bits here were those set by the STATE_BASE_ADDRESS whose * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the * bug reports that led to this workaround, and may be more than * what is strictly required to avoid the issue. */ brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL | PIPE_CONTROL_INSTRUCTION_INVALIDATE | PIPE_CONTROL_STATE_CACHE_INVALIDATE); }