/**
 * Convenience wrapper: reserve temporary upload space and fill it with the
 * caller's bytes in one call.
 *
 * The space is obtained from intel_upload_space() using \c size and
 * \c alignment; \c out_bo and \c out_offset are handed straight to that
 * function.  Exactly \c size bytes are then copied from \c data.
 *
 * Do not keep references to the uploaded memory across batch flushes.
 */
void
intel_upload_data(struct brw_context *brw,
                  const void *data,
                  uint32_t size,
                  uint32_t alignment,
                  drm_intel_bo **out_bo,
                  uint32_t *out_offset)
{
   memcpy(intel_upload_space(brw, size, alignment, out_bo, out_offset),
          data, size);
}
/**
 * Uploads a shader stage's pull constants into temporary upload space and
 * creates the constant surface that points at them.
 *
 * Pull constants are GLSL uniforms (and other constant data) that did not
 * fit as push constants, or that are accessed with a variable array index
 * (which is easiest to support with pull constants, and avoids filling
 * register space with mostly-unused data).
 *
 * The surface offset is written to stage_state->surf_offset[] at the slot
 * given by prog_data->binding_table.pull_constants_start.  If the stage has
 * no pull constants, a non-zero slot is reset to zero instead.  Whenever the
 * slot is written, \c brw_new_constbuf is OR'd into
 * brw->ctx.NewDriverState.
 *
 * \c dword_pitch is forwarded unchanged to brw_create_constant_surface().
 *
 * For the push constant equivalents see brw_curbe.c (gen4/5) and
 * gen6_vs_state.c (gen6+).
 */
void
brw_upload_pull_constants(struct brw_context *brw,
                          GLbitfield brw_new_constbuf,
                          const struct gl_program *prog,
                          struct brw_stage_state *stage_state,
                          const struct brw_stage_prog_data *prog_data,
                          bool dword_pitch)
{
   const uint32_t surf_index = prog_data->binding_table.pull_constants_start;

   if (prog_data->nr_pull_params == 0) {
      /* Nothing to upload: drop a previously published surface, if any,
       * and only flag new state when something actually changed.
       */
      if (stage_state->surf_offset[surf_index] != 0) {
         stage_state->surf_offset[surf_index] = 0;
         brw->ctx.NewDriverState |= brw_new_constbuf;
      }
      return;
   }

   /* The byte size below multiplies by 4, so each constant must be exactly
    * one float wide.
    */
   STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float));

   /* Refresh the ParameterValues[i] pointers of every parameter whose basic
    * type is PROGRAM_STATE_VAR before dereferencing them below.
    */
   _mesa_load_state_parameters(&brw->ctx, prog->Parameters);

   /* BRW_NEW_*_PROG_DATA | _NEW_PROGRAM_CONSTANTS */
   const uint32_t upload_bytes = prog_data->nr_pull_params * 4;
   uint32_t upload_offset;
   drm_intel_bo *upload_bo = NULL;
   gl_constant_value *dst =
      intel_upload_space(brw, upload_bytes, 64, &upload_bo, &upload_offset);

   int n;
   for (n = 0; n < prog_data->nr_pull_params; n++)
      dst[n] = *prog_data->pull_param[n];

   /* Disabled debug dump of the uploaded values, four per row. */
   if (0) {
      for (n = 0; n < ALIGN(prog_data->nr_pull_params, 4) / 4; n++) {
         const gl_constant_value *row = &dst[n * 4];
         fprintf(stderr, "const surface %3d: %4.3f %4.3f %4.3f %4.3f\n",
                 n, row[0].f, row[1].f, row[2].f, row[3].f);
      }
   }

   brw_create_constant_surface(brw, upload_bo, upload_offset, upload_bytes,
                               &stage_state->surf_offset[surf_index],
                               dword_pitch);

   /* Release the BO reference taken above via intel_upload_space(). */
   drm_intel_bo_unreference(upload_bo);

   brw->ctx.NewDriverState |= brw_new_constbuf;
}
/**
 * Copies the elements [min, max] (inclusive) of a client array into upload
 * space and records the resulting location in \c buffer.
 *
 * Elements are read with the array's own stride (glarray->StrideB) and
 * written with \c dst_stride.  A source stride of zero is special: a single
 * element of glarray->_ElementSize bytes is uploaded and the buffer stride
 * is set to 0, since there is no need to replicate the value.
 *
 * On return buffer->stride and buffer->size describe the upload;
 * &buffer->bo and &buffer->offset are passed to the upload helpers to
 * receive its location.
 */
static void
copy_array_to_vbo_array(struct brw_context *brw,
                        struct brw_vertex_element *element,
                        int min, int max,
                        struct brw_vertex_buffer *buffer,
                        GLuint dst_stride)
{
   const int src_stride = element->glarray->StrideB;

   if (src_stride != 0) {
      int remaining = max - min + 1;
      const GLuint total_bytes = remaining * dst_stride;
      const unsigned char *from = element->glarray->Ptr + min * src_stride;
      uint8_t *to = intel_upload_space(brw, total_bytes, dst_stride,
                                       &buffer->bo, &buffer->offset);

      if (dst_stride != src_stride) {
         /* Strides differ: repack one element at a time. */
         for (; remaining != 0; remaining--) {
            memcpy(to, from, dst_stride);
            from += src_stride;
            to += dst_stride;
         }
      } else {
         /* Identical layout on both sides: a single bulk copy will do. */
         memcpy(to, from, total_bytes);
      }

      buffer->stride = dst_stride;
      buffer->size = total_bytes;
      return;
   }

   /* Zero source stride: upload the one element once, aligned to its own
    * size, and leave the buffer stride at 0.
    */
   intel_upload_data(brw, element->glarray->Ptr,
                     element->glarray->_ElementSize,
                     element->glarray->_ElementSize,
                     &buffer->bo, &buffer->offset);

   buffer->stride = 0;
   buffer->size = element->glarray->_ElementSize;
}
/** * Gathers together all the uniform values into a block of memory to be * uploaded into the CURBE, then emits the state packet telling the hardware * the new location. */ static void brw_upload_constant_buffer(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_CURBE_OFFSETS */ const GLuint sz = brw->curbe.total_size; const GLuint bufsz = sz * 16 * sizeof(GLfloat); gl_constant_value *buf; GLuint i; gl_clip_plane *clip_planes; if (sz == 0) { goto emit; } buf = intel_upload_space(brw, bufsz, 64, &brw->curbe.curbe_bo, &brw->curbe.curbe_offset); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); /* fragment shader constants */ if (brw->curbe.wm_size) { _mesa_load_state_parameters(ctx, brw->fragment_program->Base.Parameters); /* BRW_NEW_CURBE_OFFSETS */ GLuint offset = brw->curbe.wm_start * 16; /* BRW_NEW_FS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->wm.prog_data->base.nr_params; i++) { buf[offset + i] = *brw->wm.prog_data->base.param[i]; } } /* clipper constants */ if (brw->curbe.clip_size) { GLuint offset = brw->curbe.clip_start * 16; GLuint j; /* If any planes are going this way, send them all this way: */ for (i = 0; i < 6; i++) { buf[offset + i * 4 + 0].f = fixed_plane[i][0]; buf[offset + i * 4 + 1].f = fixed_plane[i][1]; buf[offset + i * 4 + 2].f = fixed_plane[i][2]; buf[offset + i * 4 + 3].f = fixed_plane[i][3]; } /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to * clip-space: */ clip_planes = brw_select_clip_planes(ctx); for (j = 0; j < MAX_CLIP_PLANES; j++) { if (ctx->Transform.ClipPlanesEnabled & (1<<j)) { buf[offset + i * 4 + 0].f = clip_planes[j][0]; buf[offset + i * 4 + 1].f = clip_planes[j][1]; buf[offset + i * 4 + 2].f = clip_planes[j][2]; buf[offset + i * 4 + 3].f = clip_planes[j][3]; i++; } } } /* vertex shader constants */ if (brw->curbe.vs_size) { _mesa_load_state_parameters(ctx, brw->vertex_program->Base.Parameters); GLuint offset = brw->curbe.vs_start * 16; /* 
BRW_NEW_VS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->vs.prog_data->base.base.nr_params; i++) { buf[offset + i] = *brw->vs.prog_data->base.base.param[i]; } } if (0) { for (i = 0; i < sz*16; i+=4) fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4, buf[i+0].f, buf[i+1].f, buf[i+2].f, buf[i+3].f); } /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the * previous time - because eg. the urb destination may have * changed, or the urb contents different to last time. * * Note that the data referred to is actually copied internally, * not just used in place according to passed pointer. * * It appears that the CS unit takes care of using each available * URB entry (Const URB Entry == CURBE) in turn, and issuing * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ emit: /* Work around mysterious 965 hangs that appear to happen if you do * two 3DPRIMITIVEs with only a CONSTANT_BUFFER inbetween. If we * haven't already flushed for some other reason, explicitly do so. * * We've found no documented reason why this should be necessary. */ if (brw->gen == 4 && !brw->is_g4x && (brw->ctx.NewDriverState & (BRW_NEW_BATCH | BRW_NEW_PSP)) == 0) { BEGIN_BATCH(1); OUT_BATCH(MI_FLUSH); ADVANCE_BATCH(); } /* BRW_NEW_URB_FENCE: From the gen4 PRM, volume 1, section 3.9.8 * (CONSTANT_BUFFER (CURBE Load)): * * "Modifying the CS URB allocation via URB_FENCE invalidates any * previous CURBE entries. Therefore software must subsequently * [re]issue a CONSTANT_BUFFER command before CURBE data can be used * in the pipeline." */ BEGIN_BATCH(2); if (brw->curbe.total_size == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); OUT_RELOC(brw->curbe.curbe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, (brw->curbe.total_size - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); }
/** * Gathers together all the uniform values into a block of memory to be * uploaded into the CURBE, then emits the state packet telling the hardware * the new location. */ static void brw_upload_constant_buffer(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; /* BRW_NEW_CURBE_OFFSETS */ const GLuint sz = brw->curbe.total_size; const GLuint bufsz = sz * 16 * sizeof(GLfloat); gl_constant_value *buf; GLuint i; gl_clip_plane *clip_planes; if (sz == 0) { goto emit; } buf = intel_upload_space(brw, bufsz, 64, &brw->curbe.curbe_bo, &brw->curbe.curbe_offset); STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); /* fragment shader constants */ if (brw->curbe.wm_size) { _mesa_load_state_parameters(ctx, brw->fragment_program->Parameters); /* BRW_NEW_CURBE_OFFSETS */ GLuint offset = brw->curbe.wm_start * 16; /* BRW_NEW_FS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->wm.base.prog_data->nr_params; i++) { buf[offset + i] = *brw->wm.base.prog_data->param[i]; } } /* clipper constants */ if (brw->curbe.clip_size) { GLuint offset = brw->curbe.clip_start * 16; GLbitfield mask; /* If any planes are going this way, send them all this way: */ for (i = 0; i < 6; i++) { buf[offset + i * 4 + 0].f = fixed_plane[i][0]; buf[offset + i * 4 + 1].f = fixed_plane[i][1]; buf[offset + i * 4 + 2].f = fixed_plane[i][2]; buf[offset + i * 4 + 3].f = fixed_plane[i][3]; } /* Clip planes: _NEW_TRANSFORM plus _NEW_PROJECTION to get to * clip-space: */ clip_planes = brw_select_clip_planes(ctx); mask = ctx->Transform.ClipPlanesEnabled; while (mask) { const int j = u_bit_scan(&mask); buf[offset + i * 4 + 0].f = clip_planes[j][0]; buf[offset + i * 4 + 1].f = clip_planes[j][1]; buf[offset + i * 4 + 2].f = clip_planes[j][2]; buf[offset + i * 4 + 3].f = clip_planes[j][3]; i++; } } /* vertex shader constants */ if (brw->curbe.vs_size) { _mesa_load_state_parameters(ctx, brw->vertex_program->Parameters); GLuint offset = brw->curbe.vs_start * 16; /* 
BRW_NEW_VS_PROG_DATA | _NEW_PROGRAM_CONSTANTS: copy uniform values */ for (i = 0; i < brw->vs.base.prog_data->nr_params; i++) { buf[offset + i] = *brw->vs.base.prog_data->param[i]; } } if (0) { for (i = 0; i < sz*16; i+=4) fprintf(stderr, "curbe %d.%d: %f %f %f %f\n", i/8, i&4, buf[i+0].f, buf[i+1].f, buf[i+2].f, buf[i+3].f); } /* Because this provokes an action (ie copy the constants into the * URB), it shouldn't be shortcircuited if identical to the * previous time - because eg. the urb destination may have * changed, or the urb contents different to last time. * * Note that the data referred to is actually copied internally, * not just used in place according to passed pointer. * * It appears that the CS unit takes care of using each available * URB entry (Const URB Entry == CURBE) in turn, and issuing * flushes as necessary when doublebuffering of CURBEs isn't * possible. */ emit: /* BRW_NEW_URB_FENCE: From the gen4 PRM, volume 1, section 3.9.8 * (CONSTANT_BUFFER (CURBE Load)): * * "Modifying the CS URB allocation via URB_FENCE invalidates any * previous CURBE entries. Therefore software must subsequently * [re]issue a CONSTANT_BUFFER command before CURBE data can be used * in the pipeline." */ BEGIN_BATCH(2); if (brw->curbe.total_size == 0) { OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2)); OUT_BATCH(0); } else { OUT_BATCH((CMD_CONST_BUFFER << 16) | (1 << 8) | (2 - 2)); OUT_RELOC(brw->curbe.curbe_bo, I915_GEM_DOMAIN_INSTRUCTION, 0, (brw->curbe.total_size - 1) + brw->curbe.curbe_offset); } ADVANCE_BATCH(); /* Work around a Broadwater/Crestline depth interpolator bug. The * following sequence will cause GPU hangs: * * 1. Change state so that all depth related fields in CC_STATE are * disabled, and in WM_STATE, only "PS Use Source Depth" is enabled. * 2. Emit a CONSTANT_BUFFER packet. * 3. Draw via 3DPRIMITIVE. * * The recommended workaround is to emit a non-pipelined state change after * emitting CONSTANT_BUFFER, in order to drain the windowizer pipeline. 
* * We arbitrarily choose 3DSTATE_GLOBAL_DEPTH_CLAMP_OFFSET (as it's small), * and always emit it when "PS Use Source Depth" is set. We could be more * precise, but the additional complexity is probably not worth it. * * BRW_NEW_FRAGMENT_PROGRAM */ if (brw->gen == 4 && !brw->is_g4x && (brw->fragment_program->info.inputs_read & (1 << VARYING_SLOT_POS))) { BEGIN_BATCH(2); OUT_BATCH(_3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP << 16 | (2 - 2)); OUT_BATCH(0); ADVANCE_BATCH(); } }
/** * Creates a streamed BO containing the push constants for the VS or GS on * gen6+. * * Push constants are constant values (such as GLSL uniforms) that are * pre-loaded into a shader stage's register space at thread spawn time. * * Not all GLSL uniforms will be uploaded as push constants: The hardware has * a limitation of 32 or 64 EU registers (256 or 512 floats) per stage to be * uploaded as push constants, while GL 4.4 requires at least 1024 components * to be usable for the VS. Plus, currently we always use pull constants * instead of push constants when doing variable-index array access. * * See brw_curbe.c for the equivalent gen4/5 code. */ void gen6_upload_push_constants(struct brw_context *brw, const struct gl_program *prog, const struct brw_stage_prog_data *prog_data, struct brw_stage_state *stage_state) { struct gl_context *ctx = &brw->ctx; if (prog_data->nr_params == 0) { stage_state->push_const_size = 0; } else { /* Updates the ParamaterValues[i] pointers for all parameters of the * basic type of PROGRAM_STATE_VAR. */ /* XXX: Should this happen somewhere before to get our state flag set? */ if (prog) _mesa_load_state_parameters(ctx, prog->Parameters); int i; const int size = prog_data->nr_params * sizeof(gl_constant_value); gl_constant_value *param; if (brw->gen >= 8 || brw->is_haswell) { param = intel_upload_space(brw, size, 32, &stage_state->push_const_bo, &stage_state->push_const_offset); } else { param = brw_state_batch(brw, size, 32, &stage_state->push_const_offset); } STATIC_ASSERT(sizeof(gl_constant_value) == sizeof(float)); /* _NEW_PROGRAM_CONSTANTS * * Also _NEW_TRANSFORM -- we may reference clip planes other than as a * side effect of dereferencing uniforms, so _NEW_PROGRAM_CONSTANTS * wouldn't be set for them. 
*/ for (i = 0; i < prog_data->nr_params; i++) { param[i] = *prog_data->param[i]; } if (0) { fprintf(stderr, "%s constants:\n", _mesa_shader_stage_to_string(stage_state->stage)); for (i = 0; i < prog_data->nr_params; i++) { if ((i & 7) == 0) fprintf(stderr, "g%d: ", prog_data->dispatch_grf_start_reg + i / 8); fprintf(stderr, "%8f ", param[i].f); if ((i & 7) == 7) fprintf(stderr, "\n"); } if ((i & 7) != 0) fprintf(stderr, "\n"); fprintf(stderr, "\n"); } stage_state->push_const_size = ALIGN(prog_data->nr_params, 8) / 8; /* We can only push 32 registers of constants at a time. */ /* From the SNB PRM (vol2, part 1, section 3.2.1.4: 3DSTATE_CONSTANT_VS: * * "The sum of all four read length fields (each incremented to * represent the actual read length) must be less than or equal to * 32" * * From the IVB PRM (vol2, part 1, section 3.2.1.3: 3DSTATE_CONSTANT_VS: * * "The sum of all four read length fields must be less than or * equal to the size of 64" * * The other shader stages all match the VS's limits. */ assert(stage_state->push_const_size <= 32); } stage_state->push_constants_dirty = true; }