bool
brw_wm_do_compile(struct brw_context *brw, struct brw_wm_compile *c)
{
   struct gl_context *ctx = &brw->ctx;
   struct gl_shader *fs = NULL;

   if (c->shader_prog)
      fs = c->shader_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count;
   if (fs) {
      param_count = fs->num_uniform_components;
   } else {
      param_count = c->fp->program.Base.Parameters->NumParameters * 4;
   }
   /* The backend also sometimes adds params for texture size. */
   param_count += 2 * ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits;

   c->prog_data.base.param = rzalloc_array(NULL, const float *, param_count);
   c->prog_data.base.pull_param = rzalloc_array(NULL, const float *, param_count);
   c->prog_data.base.nr_params = param_count;

   c->prog_data.barycentric_interp_modes =
      brw_compute_barycentric_interp_modes(brw, c->key.flat_shade,
                                           c->key.persample_shading,
                                           &c->fp->program);

   c->program = brw_wm_fs_emit(brw, c, &c->fp->program, c->shader_prog,
                               &c->program_size);
   if (c->program == NULL)
      return false;

   /* Scratch space is used for register spilling */
   if (c->last_scratch) {
      perf_debug("Fragment shader triggered register spilling. "
                 "Try reducing the number of live scalar values to "
                 "improve performance.\n");

      c->prog_data.total_scratch = brw_get_scratch_size(c->last_scratch);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      fprintf(stderr, "\n");

   return true;
}
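/* A rough worked example of the worst-case param sizing above, using
 * hypothetical numbers (not taken from this source): a linked GLSL fragment
 * shader with 20 uniform components on hardware reporting 16 fragment
 * texture image units.
 */
static void
example_wm_param_sizing(void)
{
   int param_count = 20;      /* fs->num_uniform_components (hypothetical)  */
   param_count += 2 * 16;     /* two texture-size params per image unit     */
   /* param_count == 52: enough room even if the backend materializes a
    * width/height pair for every texture image unit.
    */
   (void) param_count;
}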
bool
brw_codegen_gs_prog(struct brw_context *brw,
                    struct gl_shader_program *prog,
                    struct brw_geometry_program *gp,
                    struct brw_gs_prog_key *key)
{
   struct brw_stage_state *stage_state = &brw->gs.base;
   struct brw_gs_compile c;
   memset(&c, 0, sizeof(c));
   c.key = *key;
   c.gp = gp;

   c.prog_data.include_primitive_id =
      (gp->program.Base.InputsRead & VARYING_BIT_PRIMITIVE_ID) != 0;

   c.prog_data.invocations = gp->program.Invocations;

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    *
    * Note: param_count needs to be num_uniform_components * 4, since we add
    * padding around uniform values below vec4 size, so the worst case is that
    * every uniform is a float which gets padded to the size of a vec4.
    */
   struct gl_shader *gs = prog->_LinkedShaders[MESA_SHADER_GEOMETRY];
   int param_count = gs->num_uniform_components * 4;

   /* We also upload clip plane data as uniforms */
   param_count += MAX_CLIP_PLANES * 4;

   c.prog_data.base.base.param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   c.prog_data.base.base.pull_param =
      rzalloc_array(NULL, const gl_constant_value *, param_count);
   c.prog_data.base.base.nr_params = param_count;

   if (brw->gen >= 7) {
      if (gp->program.OutputType == GL_POINTS) {
         /* When the output type is points, the geometry shader may output data
          * to multiple streams, and EndPrimitive() has no effect. So we
          * configure the hardware to interpret the control data as stream ID.
          */
         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;

         /* We only have to emit control bits if we are using streams */
         if (prog->Geom.UsesStreams)
            c.control_data_bits_per_vertex = 2;
         else
            c.control_data_bits_per_vertex = 0;
      } else {
         /* When the output type is triangle_strip or line_strip, EndPrimitive()
          * may be used to terminate the current strip and start a new one
          * (similar to primitive restart), and outputting data to multiple
          * streams is not supported. So we configure the hardware to interpret
          * the control data as EndPrimitive information (a.k.a. "cut bits").
          */
         c.prog_data.control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;

         /* We only need to output control data if the shader actually calls
          * EndPrimitive().
          */
         c.control_data_bits_per_vertex =
            gp->program.UsesEndPrimitive ? 1 : 0;
      }
   } else {
      /* There are no control data bits in gen6. */
      c.control_data_bits_per_vertex = 0;

      /* If it is using transform feedback, enable it */
      if (prog->TransformFeedback.NumVarying)
         c.prog_data.gen6_xfb_enabled = true;
      else
         c.prog_data.gen6_xfb_enabled = false;
   }
   c.control_data_header_size_bits =
      gp->program.VerticesOut * c.control_data_bits_per_vertex;

   /* 1 HWORD = 32 bytes = 256 bits */
   c.prog_data.control_data_header_size_hwords =
      ALIGN(c.control_data_header_size_bits, 256) / 256;

   GLbitfield64 outputs_written = gp->program.Base.OutputsWritten;

   /* In order for legacy clipping to work, we need to populate the clip
    * distance varying slots whenever clipping is enabled, even if the vertex
    * shader doesn't write to gl_ClipDistance.
    */
   if (c.key.base.userclip_active) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
   }

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &c.prog_data.base.vue_map, outputs_written);

   /* Compute the output vertex size.
    *
    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
    * Size (p168):
    *
    *     [0,62] indicating [1,63] 16B units
    *
    *     Specifies the size of each vertex stored in the GS output entry
    *     (following any Control Header data) as a number of 128-bit units
    *     (minus one).
    *
    *     Programming Restrictions: The vertex size must be programmed as a
    *     multiple of 32B units with the following exception: Rendering is
    *     disabled (as per SOL stage state) and the vertex size output by the
    *     GS thread is 16B.
    *
    *     If rendering is enabled (as per SOL state) the vertex size must be
    *     programmed as a multiple of 32B units. In other words, the only time
    *     software can program a vertex size with an odd number of 16B units
    *     is when rendering is disabled.
    *
    * Note: B=bytes in the above text.
    *
    * It doesn't seem worth the extra trouble to optimize the case where the
    * vertex size is 16B (especially since this would require special-casing
    * the GEN assembly that writes to the URB). So we just set the vertex
    * size to a multiple of 32B (2 vec4's) in all cases.
    *
    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords). We
    * budget that as follows:
    *
    *   512 bytes for varyings (a varying component is 4 bytes and
    *             gl_MaxGeometryOutputComponents = 128)
    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
    *             bytes)
    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
    *             even if it's not used)
    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
    *             whenever clip planes are enabled, even if the shader doesn't
    *             write to gl_ClipDistance)
    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
    *             (see below)--this causes up to 1 VUE slot to be wasted
    *   400 bytes available for varying packing overhead
    *
    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
    * per interpolation type, so this is plenty.
    */
   unsigned output_vertex_size_bytes = c.prog_data.base.vue_map.num_slots * 16;
   assert(brw->gen == 6 ||
          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
   c.prog_data.output_vertex_size_hwords =
      ALIGN(output_vertex_size_bytes, 32) / 32;

   /* Compute URB entry size. The maximum allowed URB entry size is 32k.
    * That divides up as follows:
    *
    *     64 bytes for the control data header (cut indices or StreamID bits)
    *   4096 bytes for varyings (a varying component is 4 bytes and
    *              gl_MaxGeometryTotalOutputComponents = 1024)
    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
    *              even if it's not used)
    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
    *              whenever clip planes are enabled, even if the shader doesn't
    *              write to gl_ClipDistance)
    *   4096 bytes overhead since the VUE size must be a multiple of 32
    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
    *   8128 bytes available for varying packing overhead
    *
    * Worst-case varying packing overhead is 3/4 of a varying slot per
    * interpolation type, which works out to 3072 bytes, so this would allow
    * us to accommodate 2 interpolation types without any danger of running
    * out of URB space.
    *
    * In practice, the risk of running out of URB space is very small, since
    * the above figures are all worst-case, and most of them scale with the
    * number of output vertices. So we'll just calculate the amount of space
    * we need, and if it's too large, fail to compile.
    *
    * The above is for gen7+ where we have a single URB entry that will hold
    * all the output. In gen6, we will have to allocate URB entries for every
    * vertex we emit, so our URB entries only need to be large enough to hold
    * a single vertex. Also, gen6 does not have a control data header.
    */
   unsigned output_size_bytes;
   if (brw->gen >= 7) {
      output_size_bytes =
         c.prog_data.output_vertex_size_hwords * 32 * gp->program.VerticesOut;
      output_size_bytes += 32 * c.prog_data.control_data_header_size_hwords;
   } else {
      output_size_bytes = c.prog_data.output_vertex_size_hwords * 32;
   }

   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
    * which comes before the control header.
    */
   if (brw->gen >= 8)
      output_size_bytes += 32;

   assert(output_size_bytes >= 1);
   int max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
   if (brw->gen == 6)
      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
   if (output_size_bytes > max_output_size_bytes)
      return false;

   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
    * a multiple of 128 bytes in gen6.
    */
   if (brw->gen >= 7)
      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
   else
      c.prog_data.base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;

   c.prog_data.output_topology =
      get_hw_prim_for_gl_prim(gp->program.OutputType);

   brw_compute_vue_map(brw->intelScreen->devinfo,
                       &c.input_vue_map, c.key.input_varyings);

   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
    * need to program a URB read length of ceiling(num_slots / 2).
    */
   c.prog_data.base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;

   void *mem_ctx = ralloc_context(NULL);
   unsigned program_size;
   const unsigned *program =
      brw_gs_emit(brw, prog, &c, mem_ctx, &program_size);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   /* Scratch space is used for register spilling */
   if (c.base.last_scratch) {
      perf_debug("Geometry shader triggered register spilling. "
                 "Try reducing the number of live vec4 values to "
                 "improve performance.\n");

      c.prog_data.base.base.total_scratch
         = brw_get_scratch_size(c.base.last_scratch * REG_SIZE);

      brw_get_scratch_bo(brw, &stage_state->scratch_bo,
                         c.prog_data.base.base.total_scratch *
                         brw->max_gs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_CACHE_GS_PROG,
                    &c.key, sizeof(c.key),
                    program, program_size,
                    &c.prog_data, sizeof(c.prog_data),
                    &stage_state->prog_offset, &brw->gs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
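/* A rough worked example of the sizing logic above, with hypothetical numbers
 * (nothing here is taken from a real shader): a gen7 geometry shader that
 * calls EndPrimitive(), emits at most 4 vertices, and whose output VUE map
 * has 6 slots. Uses the same ALIGN() macro as the code above.
 */
static void
example_gs_urb_sizing(void)
{
   unsigned control_data_header_size_hwords =
      ALIGN(4 * 1, 256) / 256;                     /* 4 cut bits  -> 1 hword  */
   unsigned output_vertex_size_hwords =
      ALIGN(6 * 16, 32) / 32;                      /* 96 bytes    -> 3 hwords */
   unsigned output_size_bytes =
      output_vertex_size_hwords * 32 * 4 +         /* 4 vertices of 96 bytes  */
      32 * control_data_header_size_hwords;        /* + 32-byte header = 416  */
   unsigned urb_entry_size =
      ALIGN(output_size_bytes, 64) / 64;           /* 416 bytes -> 7 * 64B    */
   (void) urb_entry_size;
}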
static bool
do_vs_prog(struct brw_context *brw,
           struct gl_shader_program *prog,
           struct brw_vertex_program *vp,
           struct brw_vs_prog_key *key)
{
   GLuint program_size;
   const GLuint *program;
   struct brw_vs_compile c;
   struct brw_vs_prog_data prog_data;
   void *mem_ctx;
   int i;
   struct gl_shader *vs = NULL;

   if (prog)
      vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];

   memset(&c, 0, sizeof(c));
   memcpy(&c.key, key, sizeof(*key));
   memset(&prog_data, 0, sizeof(prog_data));

   mem_ctx = ralloc_context(NULL);

   c.vp = vp;

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count;
   if (vs) {
      /* We add padding around uniform values below vec4 size, with the worst
       * case being a float value that gets blown up to a vec4, so be
       * conservative here.
       */
      param_count = vs->num_uniform_components * 4;
   } else {
      param_count = vp->program.Base.Parameters->NumParameters * 4;
   }
   /* We also upload clip plane data as uniforms */
   param_count += MAX_CLIP_PLANES * 4;

   prog_data.base.param = rzalloc_array(NULL, const float *, param_count);
   prog_data.base.pull_param = rzalloc_array(NULL, const float *, param_count);

   GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
   prog_data.inputs_read = vp->program.Base.InputsRead;

   if (c.key.copy_edgeflag) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
      prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
   }

   if (brw->gen < 6) {
      /* Put dummy slots into the VUE for the SF to put the replaced
       * point sprite coords in. We shouldn't need these dummy slots,
       * which take up precious URB space, but it would mean that the SF
       * doesn't get nice aligned pairs of input coords into output
       * coords, which would be a pain to handle.
       */
      for (i = 0; i < 8; i++) {
         if (c.key.point_coord_replace & (1 << i))
            outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
      }

      /* if back colors are written, allocate slots for front colors too */
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
   }

   brw_compute_vue_map(brw, &prog_data.base.vue_map, outputs_written,
                       c.key.base.userclip_active);

   if (0) {
      _mesa_fprint_program_opt(stdout, &c.vp->program.Base, PROG_PRINT_DEBUG,
                               true);
   }

   /* Emit GEN4 code. */
   program = brw_vs_emit(brw, prog, &c, &prog_data, mem_ctx, &program_size);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   if (prog_data.base.nr_pull_params)
      prog_data.base.num_surfaces = 1;
   if (c.vp->program.Base.SamplersUsed)
      prog_data.base.num_surfaces = SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT);
   if (prog &&
       prog->_LinkedShaders[MESA_SHADER_VERTEX]->NumUniformBlocks) {
      prog_data.base.num_surfaces =
         SURF_INDEX_VS_UBO(prog->_LinkedShaders[MESA_SHADER_VERTEX]->NumUniformBlocks);
   }

   /* Scratch space is used for register spilling */
   if (c.base.last_scratch) {
      perf_debug("Vertex shader triggered register spilling. "
                 "Try reducing the number of live vec4 values to "
                 "improve performance.\n");

      prog_data.base.total_scratch
         = brw_get_scratch_size(c.base.last_scratch * REG_SIZE);

      brw_get_scratch_bo(brw, &brw->vs.scratch_bo,
                         prog_data.base.total_scratch * brw->max_vs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_VS_PROG,
                    &c.key, sizeof(c.key),
                    program, program_size,
                    &prog_data, sizeof(prog_data),
                    &brw->vs.prog_offset, &brw->vs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}
static bool
brw_vs_do_compile(struct brw_context *brw,
                  struct brw_vs_compile *c)
{
   struct brw_stage_prog_data *stage_prog_data = &c->prog_data.base.base;
   struct gl_shader *vs = NULL;
   int i;

   if (c->base.shader_prog)
      vs = c->base.shader_prog->_LinkedShaders[MESA_SHADER_VERTEX];

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count;
   if (vs) {
      /* We add padding around uniform values below vec4 size, with the worst
       * case being a float value that gets blown up to a vec4, so be
       * conservative here.
       */
      param_count = vs->num_uniform_components * 4;
   } else {
      param_count = c->vp->program.Base.Parameters->NumParameters * 4;
   }
   /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
    * planes as uniforms.
    */
   param_count += c->key.base.nr_userclip_plane_consts * 4;

   stage_prog_data->param = rzalloc_array(NULL, const float *, param_count);
   stage_prog_data->pull_param = rzalloc_array(NULL, const float *, param_count);

   /* Setting nr_params here NOT to the size of the param and pull_param
    * arrays, but to the number of uniform components vec4_visitor
    * needs. vec4_visitor::setup_uniforms() will set it back to a proper value.
    */
   stage_prog_data->nr_params = ALIGN(param_count, 4) / 4;
   if (vs) {
      stage_prog_data->nr_params += vs->num_samplers;
   }

   GLbitfield64 outputs_written = c->vp->program.Base.OutputsWritten;
   c->prog_data.inputs_read = c->vp->program.Base.InputsRead;

   if (c->key.copy_edgeflag) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
      c->prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
   }

   if (brw->gen < 6) {
      /* Put dummy slots into the VUE for the SF to put the replaced
       * point sprite coords in. We shouldn't need these dummy slots,
       * which take up precious URB space, but it would mean that the SF
       * doesn't get nice aligned pairs of input coords into output
       * coords, which would be a pain to handle.
       */
      for (i = 0; i < 8; i++) {
         if (c->key.point_coord_replace & (1 << i))
            outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
      }

      /* if back colors are written, allocate slots for front colors too */
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
   }

   /* In order for legacy clipping to work, we need to populate the clip
    * distance varying slots whenever clipping is enabled, even if the vertex
    * shader doesn't write to gl_ClipDistance.
    */
   if (c->key.base.userclip_active) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
   }

   brw_compute_vue_map(brw, &c->prog_data.base.vue_map, outputs_written);

   if (0) {
      _mesa_fprint_program_opt(stderr, &c->vp->program.Base, PROG_PRINT_DEBUG,
                               true);
   }

   /* Emit GEN4 code. */
   c->base.program = brw_vs_emit(brw, c->base.shader_prog, c, &c->prog_data,
                                 c->base.mem_ctx, &c->base.program_size);
   if (c->base.program == NULL)
      return false;

   if (c->base.last_scratch) {
      c->prog_data.base.total_scratch
         = brw_get_scratch_size(c->base.last_scratch * REG_SIZE);
   }

   return true;
}
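/* A rough worked example of the conservative uniform sizing above, with a
 * hypothetical linked shader (the numbers are illustrative, not from this
 * source): `uniform float a; uniform vec2 b;` gives 3 uniform components.
 * Because each uniform below vec4 size may get padded up to a full vec4,
 * param space is reserved as if every component became its own vec4.
 */
static void
example_vs_uniform_sizing(void)
{
   int uniform_components = 3;                       /* hypothetical count      */
   int param_count = uniform_components * 4;         /* 12 param/pull_param slots */
   int nr_params = ALIGN(param_count, 4) / 4;        /* 3: the component count
                                                      * vec4_visitor expects,
                                                      * fixed up later by
                                                      * setup_uniforms()        */
   (void) nr_params;
}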
/**
 * All Mesa program -> GPU code generation goes through this function.
 * Depending on the instructions used (i.e. flow control instructions)
 * we'll use one of two code generators.
 */
bool do_wm_prog(struct brw_context *brw,
                struct gl_shader_program *prog,
                struct brw_fragment_program *fp,
                struct brw_wm_prog_key *key)
{
   struct brw_wm_compile *c;
   const GLuint *program;
   struct gl_shader *fs = NULL;
   GLuint program_size;

   if (prog)
      fs = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   c = rzalloc(NULL, struct brw_wm_compile);

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count;
   if (fs) {
      param_count = fs->num_uniform_components;
   } else {
      param_count = fp->program.Base.Parameters->NumParameters * 4;
   }
   /* The backend also sometimes adds params for texture size. */
   param_count += 2 * BRW_MAX_TEX_UNIT;

   c->prog_data.param = rzalloc_array(NULL, const float *, param_count);
   c->prog_data.pull_param = rzalloc_array(NULL, const float *, param_count);

   memcpy(&c->key, key, sizeof(*key));

   c->prog_data.barycentric_interp_modes =
      brw_compute_barycentric_interp_modes(brw, c->key.flat_shade,
                                           &fp->program);

   program = brw_wm_fs_emit(brw, c, &fp->program, prog, &program_size);
   if (program == NULL)
      return false;

   /* Scratch space is used for register spilling */
   if (c->last_scratch) {
      perf_debug("Fragment shader triggered register spilling. "
                 "Try reducing the number of live scalar values to "
                 "improve performance.\n");

      c->prog_data.total_scratch = brw_get_scratch_size(c->last_scratch);

      brw_get_scratch_bo(brw, &brw->wm.scratch_bo,
                         c->prog_data.total_scratch * brw->max_wm_threads);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      fprintf(stderr, "\n");

   brw_upload_cache(&brw->cache, BRW_WM_PROG,
                    &c->key, sizeof(c->key),
                    program, program_size,
                    &c->prog_data, sizeof(c->prog_data),
                    &brw->wm.prog_offset, &brw->wm.prog_data);

   ralloc_free(c);

   return true;
}
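/* Rough illustration of how the spill scratch buffer above ends up sized,
 * using hypothetical numbers (the rounding done by brw_get_scratch_size() and
 * the thread count are assumptions here, not values from this source): the
 * compiler reports the per-thread spill footprint, it is rounded up to a
 * per-thread allocation, and the buffer object must cover every hardware
 * thread running the shader at once.
 */
static void
example_wm_scratch_sizing(void)
{
   unsigned per_thread_scratch = 2048;   /* assumed rounded-up per-thread size */
   unsigned max_wm_threads     = 40;     /* hypothetical hardware thread count */
   unsigned scratch_bo_size =
      per_thread_scratch * max_wm_threads;   /* 81920 bytes (80 KB) total      */
   (void) scratch_bo_size;
}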
static bool
do_vs_prog(struct brw_context *brw,
           struct gl_shader_program *prog,
           struct brw_vertex_program *vp,
           struct brw_vs_prog_key *key)
{
   struct gl_context *ctx = &brw->intel.ctx;
   struct intel_context *intel = &brw->intel;
   GLuint program_size;
   const GLuint *program;
   struct brw_vs_compile c;
   void *mem_ctx;
   int aux_size;
   int i;
   struct gl_shader *vs = NULL;

   if (prog)
      vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];

   memset(&c, 0, sizeof(c));
   memcpy(&c.key, key, sizeof(*key));

   mem_ctx = ralloc_context(NULL);

   brw_init_compile(brw, &c.func, mem_ctx);
   c.vp = vp;

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count;
   if (vs) {
      /* We add padding around uniform values below vec4 size, with the worst
       * case being a float value that gets blown up to a vec4, so be
       * conservative here.
       */
      param_count = vs->num_uniform_components * 4;

      /* We also upload clip plane data as uniforms */
      param_count += MAX_CLIP_PLANES * 4;
   } else {
      param_count = vp->program.Base.Parameters->NumParameters * 4;
   }
   c.prog_data.param = rzalloc_array(NULL, const float *, param_count);
   c.prog_data.pull_param = rzalloc_array(NULL, const float *, param_count);

   c.prog_data.outputs_written = vp->program.Base.OutputsWritten;
   c.prog_data.inputs_read = vp->program.Base.InputsRead;

   if (c.key.copy_edgeflag) {
      c.prog_data.outputs_written |= BITFIELD64_BIT(VERT_RESULT_EDGE);
      c.prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
   }

   /* Put dummy slots into the VUE for the SF to put the replaced
    * point sprite coords in. We shouldn't need these dummy slots,
    * which take up precious URB space, but it would mean that the SF
    * doesn't get nice aligned pairs of input coords into output
    * coords, which would be a pain to handle.
    */
   for (i = 0; i < 8; i++) {
      if (c.key.point_coord_replace & (1 << i))
         c.prog_data.outputs_written |= BITFIELD64_BIT(VERT_RESULT_TEX0 + i);
   }

   brw_compute_vue_map(&c);

   if (0) {
      _mesa_fprint_program_opt(stdout, &c.vp->program.Base, PROG_PRINT_DEBUG,
                               true);
   }

   /* Emit GEN4 code. */
   if (prog) {
      if (!brw_vs_emit(prog, &c)) {
         ralloc_free(mem_ctx);
         return false;
      }
   } else {
      brw_old_vs_emit(&c);
   }

   if (c.prog_data.nr_pull_params)
      c.prog_data.num_surfaces = 1;
   if (c.vp->program.Base.SamplersUsed)
      c.prog_data.num_surfaces = SURF_INDEX_VS_TEXTURE(BRW_MAX_TEX_UNIT);
   if (prog &&
       prog->_LinkedShaders[MESA_SHADER_VERTEX]->NumUniformBlocks) {
      c.prog_data.num_surfaces =
         SURF_INDEX_VS_UBO(prog->_LinkedShaders[MESA_SHADER_VERTEX]->NumUniformBlocks);
   }

   /* Scratch space is used for register spilling */
   if (c.last_scratch) {
      perf_debug("Vertex shader triggered register spilling. "
                 "Try reducing the number of live vec4 values to "
                 "improve performance.\n");

      c.prog_data.total_scratch = brw_get_scratch_size(c.last_scratch);

      brw_get_scratch_bo(intel, &brw->vs.scratch_bo,
                         c.prog_data.total_scratch * brw->max_vs_threads);
   }

   /* get the program */
   program = brw_get_program(&c.func, &program_size);

   /* We upload from &c.prog_data including the constant_map assuming
    * they're packed together. It would be nice to have a
    * compile-time assert macro here.
    */
   assert(c.constant_map == (int8_t *)&c.prog_data + sizeof(c.prog_data));
   assert(ctx->Const.VertexProgram.MaxNativeParameters ==
          ARRAY_SIZE(c.constant_map));
   (void) ctx;

   aux_size = sizeof(c.prog_data);
   /* constant_map */
   aux_size += c.vp->program.Base.Parameters->NumParameters;

   brw_upload_cache(&brw->cache, BRW_VS_PROG,
                    &c.key, sizeof(c.key),
                    program, program_size,
                    &c.prog_data, aux_size,
                    &brw->vs.prog_offset, &brw->vs.prog_data);

   ralloc_free(mem_ctx);

   return true;
}
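/* Sketch of the layout assumption the asserts above rely on (the struct and
 * the array size here are illustrative, not a definition copied from this
 * source): the cache upload covers prog_data plus the constant_map as one
 * contiguous blob, so constant_map must immediately follow prog_data.
 */
struct example_vs_compile_layout {
   struct brw_vs_prog_data prog_data;   /* uploaded: sizeof(prog_data) bytes  */
   int8_t constant_map[1024];           /* uploaded: NumParameters more bytes */
};
/* Hence aux_size = sizeof(prog_data) + NumParameters bytes of constant_map. */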
static bool
do_vs_prog(struct brw_context *brw,
           struct gl_shader_program *prog,
           struct brw_vertex_program *vp,
           struct brw_vs_prog_key *key)
{
   GLuint program_size;
   const GLuint *program;
   struct brw_vs_compile c;
   struct brw_vs_prog_data prog_data;
   struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base;
   void *mem_ctx;
   int i;
   struct gl_shader *vs = NULL;

   if (prog)
      vs = prog->_LinkedShaders[MESA_SHADER_VERTEX];

   memset(&c, 0, sizeof(c));
   memcpy(&c.key, key, sizeof(*key));
   memset(&prog_data, 0, sizeof(prog_data));

   mem_ctx = ralloc_context(NULL);

   c.vp = vp;

   /* Allocate the references to the uniforms that will end up in the
    * prog_data associated with the compiled program, and which will be freed
    * by the state cache.
    */
   int param_count;
   if (vs) {
      /* We add padding around uniform values below vec4 size, with the worst
       * case being a float value that gets blown up to a vec4, so be
       * conservative here.
       */
      param_count = vs->num_uniform_components * 4;
   } else {
      param_count = vp->program.Base.Parameters->NumParameters * 4;
   }
   /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip
    * planes as uniforms.
    */
   param_count += c.key.base.nr_userclip_plane_consts * 4;

   stage_prog_data->param = rzalloc_array(NULL, const float *, param_count);
   stage_prog_data->pull_param = rzalloc_array(NULL, const float *, param_count);

   /* Setting nr_params here NOT to the size of the param and pull_param
    * arrays, but to the number of uniform components vec4_visitor
    * needs. vec4_visitor::setup_uniforms() will set it back to a proper value.
    */
   stage_prog_data->nr_params = ALIGN(param_count, 4) / 4;
   if (vs) {
      stage_prog_data->nr_params += vs->num_samplers;
   }

   GLbitfield64 outputs_written = vp->program.Base.OutputsWritten;
   prog_data.inputs_read = vp->program.Base.InputsRead;

   if (c.key.copy_edgeflag) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE);
      prog_data.inputs_read |= VERT_BIT_EDGEFLAG;
   }

   if (brw->gen < 6) {
      /* Put dummy slots into the VUE for the SF to put the replaced
       * point sprite coords in. We shouldn't need these dummy slots,
       * which take up precious URB space, but it would mean that the SF
       * doesn't get nice aligned pairs of input coords into output
       * coords, which would be a pain to handle.
       */
      for (i = 0; i < 8; i++) {
         if (c.key.point_coord_replace & (1 << i))
            outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i);
      }

      /* if back colors are written, allocate slots for front colors too */
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0);
      if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1))
         outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1);
   }

   /* In order for legacy clipping to work, we need to populate the clip
    * distance varying slots whenever clipping is enabled, even if the vertex
    * shader doesn't write to gl_ClipDistance.
    */
   if (c.key.base.userclip_active) {
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
      outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
   }

   brw_compute_vue_map(brw, &prog_data.base.vue_map, outputs_written);

   if (0) {
      _mesa_fprint_program_opt(stderr, &c.vp->program.Base, PROG_PRINT_DEBUG,
                               true);
   }

   /* Emit GEN4 code. */
   program = brw_vs_emit(brw, prog, &c, &prog_data, mem_ctx, &program_size);
   if (program == NULL) {
      ralloc_free(mem_ctx);
      return false;
   }

   /* Scratch space is used for register spilling */
   if (c.base.last_scratch) {
      perf_debug("Vertex shader triggered register spilling. "
                 "Try reducing the number of live vec4 values to "
                 "improve performance.\n");

      prog_data.base.total_scratch
         = brw_get_scratch_size(c.base.last_scratch * REG_SIZE);

      brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo,
                         prog_data.base.total_scratch * brw->max_vs_threads);
   }

   brw_upload_cache(&brw->cache, BRW_VS_PROG,
                    &c.key, sizeof(c.key),
                    program, program_size,
                    &prog_data, sizeof(prog_data),
                    &brw->vs.base.prog_offset, &brw->vs.prog_data);
   ralloc_free(mem_ctx);

   return true;
}