bool brw_codegen_vs_prog(struct brw_context *brw, struct gl_shader_program *prog, struct brw_vertex_program *vp, struct brw_vs_prog_key *key) { GLuint program_size; const GLuint *program; struct brw_vs_prog_data prog_data; struct brw_stage_prog_data *stage_prog_data = &prog_data.base.base; void *mem_ctx; int i; struct brw_shader *vs = NULL; bool start_busy = false; double start_time = 0; if (prog) vs = (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX]; memset(&prog_data, 0, sizeof(prog_data)); /* Use ALT floating point mode for ARB programs so that 0^0 == 1. */ if (!prog) stage_prog_data->use_alt_mode = true; mem_ctx = ralloc_context(NULL); brw_assign_common_binding_table_offsets(MESA_SHADER_VERTEX, brw->intelScreen->devinfo, prog, &vp->program.Base, &prog_data.base.base, 0); /* Allocate the references to the uniforms that will end up in the * prog_data associated with the compiled program, and which will be freed * by the state cache. */ int param_count = vp->program.Base.nir->num_uniforms; if (!brw->intelScreen->compiler->scalar_vs) param_count *= 4; if (vs) prog_data.base.base.nr_image_params = vs->base.NumImages; /* vec4_visitor::setup_uniform_clipplane_values() also uploads user clip * planes as uniforms. */ param_count += key->nr_userclip_plane_consts * 4; stage_prog_data->param = rzalloc_array(NULL, const gl_constant_value *, param_count); stage_prog_data->pull_param = rzalloc_array(NULL, const gl_constant_value *, param_count); stage_prog_data->image_param = rzalloc_array(NULL, struct brw_image_param, stage_prog_data->nr_image_params); stage_prog_data->nr_params = param_count; if (prog) { brw_nir_setup_glsl_uniforms(vp->program.Base.nir, prog, &vp->program.Base, &prog_data.base.base, brw->intelScreen->compiler->scalar_vs); } else { brw_nir_setup_arb_uniforms(vp->program.Base.nir, &vp->program.Base, &prog_data.base.base); } GLbitfield64 outputs_written = vp->program.Base.OutputsWritten; prog_data.inputs_read = vp->program.Base.InputsRead; if (key->copy_edgeflag) { outputs_written |= BITFIELD64_BIT(VARYING_SLOT_EDGE); prog_data.inputs_read |= VERT_BIT_EDGEFLAG; } if (brw->gen < 6) { /* Put dummy slots into the VUE for the SF to put the replaced * point sprite coords in. We shouldn't need these dummy slots, * which take up precious URB space, but it would mean that the SF * doesn't get nice aligned pairs of input coords into output * coords, which would be a pain to handle. */ for (i = 0; i < 8; i++) { if (key->point_coord_replace & (1 << i)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + i); } /* if back colors are written, allocate slots for front colors too */ if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC0)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL0); if (outputs_written & BITFIELD64_BIT(VARYING_SLOT_BFC1)) outputs_written |= BITFIELD64_BIT(VARYING_SLOT_COL1); } /* In order for legacy clipping to work, we need to populate the clip * distance varying slots whenever clipping is enabled, even if the vertex * shader doesn't write to gl_ClipDistance. */ if (key->nr_userclip_plane_consts > 0) { outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); outputs_written |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); } brw_compute_vue_map(brw->intelScreen->devinfo, &prog_data.base.vue_map, outputs_written, prog ? prog->SeparateShader : false); if (0) { _mesa_fprint_program_opt(stderr, &vp->program.Base, PROG_PRINT_DEBUG, true); } if (unlikely(brw->perf_debug)) { start_busy = (brw->batch.last_bo && drm_intel_bo_busy(brw->batch.last_bo)); start_time = get_time(); } if (unlikely(INTEL_DEBUG & DEBUG_VS)) brw_dump_ir("vertex", prog, vs ? &vs->base : NULL, &vp->program.Base); int st_index = -1; if (INTEL_DEBUG & DEBUG_SHADER_TIME) st_index = brw_get_shader_time_index(brw, prog, &vp->program.Base, ST_VS); /* Emit GEN4 code. */ char *error_str; program = brw_compile_vs(brw->intelScreen->compiler, brw, mem_ctx, key, &prog_data, vp->program.Base.nir, brw_select_clip_planes(&brw->ctx), !_mesa_is_gles3(&brw->ctx), st_index, &program_size, &error_str); if (program == NULL) { if (prog) { prog->LinkStatus = false; ralloc_strcat(&prog->InfoLog, error_str); } _mesa_problem(NULL, "Failed to compile vertex shader: %s\n", error_str); ralloc_free(mem_ctx); return false; } if (unlikely(brw->perf_debug) && vs) { if (vs->compiled_once) { brw_vs_debug_recompile(brw, prog, key); } if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { perf_debug("VS compile took %.03f ms and stalled the GPU\n", (get_time() - start_time) * 1000); } vs->compiled_once = true; } /* Scratch space is used for register spilling */ if (prog_data.base.base.total_scratch) { brw_get_scratch_bo(brw, &brw->vs.base.scratch_bo, prog_data.base.base.total_scratch * brw->max_vs_threads); } brw_upload_cache(&brw->cache, BRW_CACHE_VS_PROG, key, sizeof(struct brw_vs_prog_key), program, program_size, &prog_data, sizeof(prog_data), &brw->vs.base.prog_offset, &brw->vs.prog_data); ralloc_free(mem_ctx); return true; }
/* Upload a new set of constants. Too much variability to go into the * cache mechanism, but maybe would benefit from a comparison against * the current uploaded set of constants. */ static void brw_upload_constant_buffer(struct brw_context *brw) { struct intel_context *intel = &brw->intel; struct gl_context *ctx = &intel->ctx; const struct brw_vertex_program *vp = brw_vertex_program_const(brw->vertex_program); const GLuint sz = brw->curbe.total_size; const GLuint bufsz = sz * 16 * sizeof(GLfloat); GLfloat *buf; GLuint i; gl_clip_plane *clip_planes; if (sz == 0) { brw->curbe.last_bufsz = 0; goto emit; } buf = brw->curbe.next_buf; /* fragment shader constants */ if (brw->curbe.wm_size) { GLuint offset = brw->curbe.wm_start * 16; /* copy float constants */ for (i = 0; i < brw->wm.prog_data->nr_params; i++) { buf[offset + i] = convert_param(brw->wm.prog_data->param_convert[i], brw->wm.prog_data->param[i]); } } /* When using the old VS backend, the clipplanes are actually delivered to * both CLIP and VS units. VS uses them to calculate the outcode bitmasks. * * When using the new VS backend, it is responsible for setting up its own * clipplane constants if it needs them. This results in a slight waste of * of curbe space, but the advantage is that the new VS backend can use its * general-purpose uniform layout code to store the clipplanes. */ if (brw->curbe.clip_size) { GLuint offset = brw->curbe.clip_start * 16; GLuint j; /* If any planes are going this way, send them all this way: */ for (i = 0; i < 6; i++) { buf[offset + i * 4 + 0] = fixed_plane[i][0]; buf[offset + i * 4 + 1] = fixed_plane[i][1]; buf[offset + i * 4 + 2] = fixed_plane[i][2]; buf[offset + i * 4 + 3] = fixed_plane[i][3]; } /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to * clip-space: */ clip_planes = brw_select_clip_planes(ctx); for (j = 0; j < MAX_CLIP_PLANES; j++) { if (ctx->Transform.ClipPlanesEnabled & (1<<j)) { buf[offset + i * 4 + 0] = clip_planes[j][0]; buf[offset + i * 4 + 1] = clip_planes[j][1]; buf[offset + i * 4 + 2] = clip_planes[j][2]; buf[offset + i * 4 + 3] = clip_planes[j][3]; i++; } } } /* vertex shader constants */ if (brw->curbe.vs_size) { GLuint offset = brw->curbe.vs_start * 16; GLuint nr = brw->vs.prog_data->nr_params / 4; if (brw->vs.prog_data->uses_new_param_layout) { for (i = 0; i < brw->vs.prog_data->nr_params; i++) { buf[offset + i] = *brw->vs.prog_data->param[i]; } } else { /* Load the subset of push constants that will get used when * we also have a pull constant buffer. */ for (i = 0; i < vp->program.Base.Parameters->NumParameters; i++) { if (brw->vs.constant_map[i] != -1) { assert(brw->vs.constant_map[i] <= nr); memcpy(buf + offset + brw->vs.constant_map[i] * 4, vp->program.Base.Parameters->ParameterValues[i], 4 * sizeof(float)); } } } } if (0) { for (i = 0; i < sz*16; i+=4) printf("curbe %d.%d: %f %f %f %f\n", i/8, i&4, buf[i+0], buf[i+1], buf[i+2], buf[i+3]); printf("last_buf %p buf %p sz %d/%d cmp %d\n", brw->curbe.last_buf, buf, bufsz, brw->curbe.last_bufsz, brw->curbe.last_buf ? memcmp(buf, brw->curbe.last_buf, bufsz) : -1); } if (brw->curbe.curbe_bo != NULL && bufsz == brw->curbe.last_bufsz && memcmp(buf, brw->curbe.last_buf, bufsz) == 0) { /* constants have not changed */ } else { /* Update the record of what our last set of constants was. We * don't just flip the pointers because we don't fill in the * data in the padding between the entries. */ memcpy(brw->curbe.last_buf, buf, bufsz); brw->curbe.last_bufsz = bufsz; if (brw->curbe.curbe_bo != NULL && brw->curbe.curbe_next_offset + bufsz > brw->curbe.curbe_bo->size) { drm_intel_gem_bo_unmap_gtt(brw->curbe.curbe_bo); drm_intel_bo_unreference(brw->curbe.curbe_bo); brw->curbe.curbe_bo = NULL; } if (brw->curbe.curbe_bo == NULL) { /* Allocate a single page for CURBE entries for this batchbuffer. * They're generally around 64b. */ brw->curbe.curbe_bo = drm_intel_bo_alloc(brw->intel.bufmgr, "CURBE", 4096, 1 << 6); brw->curbe.curbe_next_offset = 0; drm_intel_gem_bo_map_gtt(brw->curbe.curbe_bo); assert(bufsz < 4096); } brw->curbe.curbe_offset = brw->curbe.curbe_next_offset; brw->curbe.curbe_next_offset += bufsz; brw->curbe.curbe_next_offset = ALIGN(brw->curbe.curbe_next_offset, 64); /* Copy data to the buffer: */ memcpy(brw->curbe.curbe_bo->virtual + brw->curbe.curbe_offset, buf, bufsz); } /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the * previous time - because eg. the urb destination may have * changed, or the urb contents different to last time. * * Note that the data referred to is actually copied internally, * not just used in place according to passed pointer. * * It appears that the CS unit takes care of using each available * URB entry (Const URB Entry == CURBE) in turn, and issuing * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ emit: BEGIN_BATCH(2); if (brw->curbe.total_size == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); OUT_RELOC(brw->curbe.curbe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, (brw->curbe.total_size - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); }
/** * Gathers together all the uniform values into a block of memory to be * uploaded into the CURBE, then emits the state packet telling the hardware * the new location. */ static void brw_upload_constant_buffer(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_CURBE_OFFSETS */ const GLuint sz = brw->curbe.total_size; const GLuint bufsz = sz * 16 * sizeof(GLfloat); gl_constant_value *buf; GLuint i; gl_clip_plane *clip_planes; if (sz == 0) { goto emit; } buf = intel_upload_space(brw, bufsz, 64, &brw->curbe.curbe_bo, &brw->curbe.curbe_offset); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); /* fragment shader constants */ if (brw->curbe.wm_size) { _mesa_load_state_parameters(ctx, brw->fragment_program->Base.Parameters); /* BRW_NEW_CURBE_OFFSETS */ GLuint offset = brw->curbe.wm_start * 16; /* BRW_NEW_FS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->wm.prog_data->base.nr_params; i++) { buf[offset + i] = *brw->wm.prog_data->base.param[i]; } } /* clipper constants */ if (brw->curbe.clip_size) { GLuint offset = brw->curbe.clip_start * 16; GLuint j; /* If any planes are going this way, send them all this way: */ for (i = 0; i < 6; i++) { buf[offset + i * 4 + 0].f = fixed_plane[i][0]; buf[offset + i * 4 + 1].f = fixed_plane[i][1]; buf[offset + i * 4 + 2].f = fixed_plane[i][2]; buf[offset + i * 4 + 3].f = fixed_plane[i][3]; } /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to * clip-space: */ clip_planes = brw_select_clip_planes(ctx); for (j = 0; j < MAX_CLIP_PLANES; j++) { if (ctx->Transform.ClipPlanesEnabled & (1<<j)) { buf[offset + i * 4 + 0].f = clip_planes[j][0]; buf[offset + i * 4 + 1].f = clip_planes[j][1]; buf[offset + i * 4 + 2].f = clip_planes[j][2]; buf[offset + i * 4 + 3].f = clip_planes[j][3]; i++; } } } /* vertex shader constants */ if (brw->curbe.vs_size) { _mesa_load_state_parameters(ctx, brw->vertex_program->Base.Parameters); GLuint offset = brw->curbe.vs_start * 16; /* BRW_NEW_VS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->vs.prog_data->base.base.nr_params; i++) { buf[offset + i] = *brw->vs.prog_data->base.base.param[i]; } } if (0) { for (i = 0; i < sz*16; i+=4) fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4, buf[i+0].f, buf[i+1].f, buf[i+2].f, buf[i+3].f); } /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the * previous time - because eg. the urb destination may have * changed, or the urb contents different to last time. * * Note that the data referred to is actually copied internally, * not just used in place according to passed pointer. * * It appears that the CS unit takes care of using each available * URB entry (Const URB Entry == CURBE) in turn, and issuing * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ emit: /* Work around mysterious 965 hangs that appear to happen if you do * two 3DPRIMITIVEs with only a CONSTANT_BUFFER inbetween. If we * haven't already flushed for some other reason, explicitly do so. * * We've found no documented reason why this should be necessary. */ if (brw->gen == 4 && !brw->is_g4x && (brw->ctx.NewDriverState & (BRW_NEW_BATCH | BRW_NEW_PSP)) == 0) { BEGIN_BATCH(1); OUT_BATCH(MI_FLUSH); ADVANCE_BATCH(); } /* BRW_NEW_URB_FENCE: From the gen4 PRM, volume 1, section 3.9.8 * (CONSTANT_BUFFER (CURBE Load)): * * "Modifying the CS URB allocation via URB_FENCE invalidates any * previous CURBE entries. Therefore software must subsequently * [re]issue a CONSTANT_BUFFER command before CURBE data can be used * in the pipeline." */ BEGIN_BATCH(2); if (brw->curbe.total_size == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); OUT_RELOC(brw->curbe.curbe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, (brw->curbe.total_size - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); }
/** * Gathers together all the uniform values into a block of memory to be * uploaded into the CURBE, then emits the state packet telling the hardware * the new location. */ static void brw_upload_constant_buffer(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_CURBE_OFFSETS */ const GLuint sz = brw->curbe.total_size; const GLuint bufsz = sz * 16 * sizeof(GLfloat); gl_constant_value *buf; GLuint i; gl_clip_plane *clip_planes; if (sz == 0) { goto emit; } buf = intel_upload_space(brw, bufsz, 64, &brw->curbe.curbe_bo, &brw->curbe.curbe_offset); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); /* fragment shader constants */ if (brw->curbe.wm_size) { _mesa_load_state_parameters(ctx, brw->fragment_program->Parameters); /* BRW_NEW_CURBE_OFFSETS */ GLuint offset = brw->curbe.wm_start * 16; /* BRW_NEW_FS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->wm.base.prog_data->nr_params; i++) { buf[offset + i] = *brw->wm.base.prog_data->param[i]; } } /* clipper constants */ if (brw->curbe.clip_size) { GLuint offset = brw->curbe.clip_start * 16; GLbitfield mask; /* If any planes are going this way, send them all this way: */ for (i = 0; i < 6; i++) { buf[offset + i * 4 + 0].f = fixed_plane[i][0]; buf[offset + i * 4 + 1].f = fixed_plane[i][1]; buf[offset + i * 4 + 2].f = fixed_plane[i][2]; buf[offset + i * 4 + 3].f = fixed_plane[i][3]; } /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to * clip-space: */ clip_planes = brw_select_clip_planes(ctx); mask = ctx->Transform.ClipPlanesEnabled; while (mask) { const int j = u_bit_scan(&mask); buf[offset + i * 4 + 0].f = clip_planes[j][0]; buf[offset + i * 4 + 1].f = clip_planes[j][1]; buf[offset + i * 4 + 2].f = clip_planes[j][2]; buf[offset + i * 4 + 3].f = clip_planes[j][3]; i++; } } /* vertex shader constants */ if (brw->curbe.vs_size) { _mesa_load_state_parameters(ctx, brw->vertex_program->Parameters); GLuint offset = brw->curbe.vs_start * 16; /* BRW_NEW_VS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->vs.base.prog_data->nr_params; i++) { buf[offset + i] = *brw->vs.base.prog_data->param[i]; } } if (0) { for (i = 0; i < sz*16; i+=4) fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4, buf[i+0].f, buf[i+1].f, buf[i+2].f, buf[i+3].f); } /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the * previous time - because eg. the urb destination may have * changed, or the urb contents different to last time. * * Note that the data referred to is actually copied internally, * not just used in place according to passed pointer. * * It appears that the CS unit takes care of using each available * URB entry (Const URB Entry == CURBE) in turn, and issuing * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ emit: /* BRW_NEW_URB_FENCE: From the gen4 PRM, volume 1, section 3.9.8 * (CONSTANT_BUFFER (CURBE Load)): * * "Modifying the CS URB allocation via URB_FENCE invalidates any * previous CURBE entries. Therefore software must subsequently * [re]issue a CONSTANT_BUFFER command before CURBE data can be used * in the pipeline." */ BEGIN_BATCH(2); if (brw->curbe.total_size == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); OUT_RELOC(brw->curbe.curbe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, (brw->curbe.total_size - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); /* Work around a Broadwater/Crestline depth interpolator bug. The * following sequence will cause GPU hangs: * * 1. Change state so that all depth related fields in CC_STATE are * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled. * 2. Emit a CONSTANT_BUFFER packet. * 3. Draw via 3DPRIMITIVE. * * The recommended workaround is to emit a non-pipelined state change after * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline. * * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small), * and always emit it when "PS Use Source Depth" is set. We could be more * precise, but the additional complexity is probably not worth it. * * BRW_NEW_FRAGMENT_PROGRAM */ if (brw->gen == 4 && !brw->is_g4x && (brw->fragment_program->info.inputs_read & (1 << VARYING_SLOT_POS))) { BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP << 16 | (2 - 2)); OUT_BATCH(0); ADVANCE_BATCH(); } }