void brw_upload_cs_prog(struct brw_context *brw) { struct gl_context *ctx = &brw->ctx; struct brw_cs_prog_key key; struct brw_compute_program *cp = (struct brw_compute_program *) brw->compute_program; if (!cp) return; if (!brw_state_dirty(brw, _NEW_TEXTURE, BRW_NEW_COMPUTE_PROGRAM)) return; brw->cs.base.sampler_count = _mesa_fls(ctx->ComputeProgram._Current->Base.SamplersUsed); brw_cs_populate_key(brw, &key); if (!brw_search_cache(&brw->cache, BRW_CACHE_CS_PROG, &key, sizeof(key), &brw->cs.base.prog_offset, &brw->cs.prog_data)) { bool success = brw_codegen_cs_prog(brw, ctx->Shader.CurrentProgram[MESA_SHADER_COMPUTE], cp, &key); (void) success; assert(success); } brw->cs.base.prog_data = &brw->cs.prog_data->base; }
static void gen7_upload_samplers(struct brw_context *brw) { struct gl_context *ctx = &brw->intel.ctx; struct gen7_sampler_state *samplers; /* BRW_NEW_VERTEX_PROGRAM and BRW_NEW_FRAGMENT_PROGRAM */ struct gl_program *vs = (struct gl_program *) brw->vertex_program; struct gl_program *fs = (struct gl_program *) brw->fragment_program; GLbitfield SamplersUsed = vs->SamplersUsed | fs->SamplersUsed; brw->sampler.count = _mesa_fls(SamplersUsed); if (brw->sampler.count == 0) return; samplers = brw_state_batch(brw, AUB_TRACE_SAMPLER_STATE, brw->sampler.count * sizeof(*samplers), 32, &brw->sampler.offset); memset(samplers, 0, brw->sampler.count * sizeof(*samplers)); for (unsigned s = 0; s < brw->sampler.count; s++) { if (SamplersUsed & (1 << s)) { const unsigned unit = (fs->SamplersUsed & (1 << s)) ? fs->SamplerUnits[s] : vs->SamplerUnits[s]; if (ctx->Texture.Unit[unit]._ReallyEnabled) gen7_update_sampler_state(brw, unit, s, &samplers[s]); } } brw->state.dirty.cache |= CACHE_NEW_SAMPLER; }
bool brw_vs_precompile(struct gl_context *ctx, struct gl_shader_program *prog) { struct brw_context *brw = brw_context(ctx); struct brw_vs_prog_key key; uint32_t old_prog_offset = brw->vs.base.prog_offset; struct brw_vs_prog_data *old_prog_data = brw->vs.prog_data; bool success; if (!prog->_LinkedShaders[MESA_SHADER_VERTEX]) return true; struct gl_vertex_program *vp = (struct gl_vertex_program *) prog->_LinkedShaders[MESA_SHADER_VERTEX]->Program; struct brw_vertex_program *bvp = brw_vertex_program(vp); memset(&key, 0, sizeof(key)); key.base.program_string_id = bvp->id; key.base.clamp_vertex_color = ctx->API == API_OPENGL_COMPAT; unsigned sampler_count = _mesa_fls(vp->Base.SamplersUsed); for (unsigned i = 0; i < sampler_count; i++) { if (vp->Base.ShadowSamplers & (1 << i)) { /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */ key.base.tex.swizzles[i] = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE); } else { /* Color sampler: assume no swizzling. */ key.base.tex.swizzles[i] = SWIZZLE_XYZW; } } success = do_vs_prog(brw, prog, bvp, &key); brw->vs.base.prog_offset = old_prog_offset; brw->vs.prog_data = old_prog_data; return success; }
/* May fail if out of video memory for texture or vbo upload, or on * fallback conditions. */ static void brw_try_draw_prims(struct gl_context *ctx, const struct gl_client_array *arrays[], const struct _mesa_prim *prims, GLuint nr_prims, const struct _mesa_index_buffer *ib, GLuint min_index, GLuint max_index, struct gl_buffer_object *indirect) { struct brw_context *brw = brw_context(ctx); GLuint i; bool fail_next = false; if (ctx->NewState) _mesa_update_state(ctx); /* Find the highest sampler unit used by each shader program. A bit-count * won't work since ARB programs use the texture unit number as the sampler * index. */ brw->wm.base.sampler_count = _mesa_fls(ctx->FragmentProgram._Current->Base.SamplersUsed); brw->gs.base.sampler_count = ctx->GeometryProgram._Current ? _mesa_fls(ctx->GeometryProgram._Current->Base.SamplersUsed) : 0; brw->vs.base.sampler_count = _mesa_fls(ctx->VertexProgram._Current->Base.SamplersUsed); /* We have to validate the textures *before* checking for fallbacks; * otherwise, the software fallback won't be able to rely on the * texture state, the firstLevel and lastLevel fields won't be * set in the intel texture object (they'll both be 0), and the * software fallback will segfault if it attempts to access any * texture level other than level 0. */ brw_validate_textures(brw); intel_prepare_render(brw); /* This workaround has to happen outside of brw_upload_render_state() * because it may flush the batchbuffer for a blit, affecting the state * flags. */ brw_workaround_depthstencil_alignment(brw, 0); /* Bind all inputs, derive varying and size information: */ brw_merge_inputs(brw, arrays); brw->ib.ib = ib; brw->ctx.NewDriverState |= BRW_NEW_INDICES; brw->vb.min_index = min_index; brw->vb.max_index = max_index; brw->ctx.NewDriverState |= BRW_NEW_VERTICES; for (i = 0; i < nr_prims; i++) { int estimated_max_prim_size; const int sampler_state_size = 16; estimated_max_prim_size = 512; /* batchbuffer commands */ estimated_max_prim_size += BRW_MAX_TEX_UNIT * (sampler_state_size + sizeof(struct gen5_sampler_default_color)); estimated_max_prim_size += 1024; /* gen6 VS push constants */ estimated_max_prim_size += 1024; /* gen6 WM push constants */ estimated_max_prim_size += 512; /* misc. pad */ /* Flush the batch if it's approaching full, so that we don't wrap while * we've got validated state that needs to be in the same batch as the * primitives. */ intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING); intel_batchbuffer_save_state(brw); if (brw->num_instances != prims[i].num_instances || brw->basevertex != prims[i].basevertex) { brw->num_instances = prims[i].num_instances; brw->basevertex = prims[i].basevertex; if (i > 0) { /* For i == 0 we just did this before the loop */ brw->ctx.NewDriverState |= BRW_NEW_VERTICES; brw_merge_inputs(brw, arrays); } } brw->draw.gl_basevertex = prims[i].indexed ? prims[i].basevertex : prims[i].start; drm_intel_bo_unreference(brw->draw.draw_params_bo); if (prims[i].is_indirect) { /* Point draw_params_bo at the indirect buffer. */ brw->draw.draw_params_bo = intel_buffer_object(ctx->DrawIndirectBuffer)->buffer; drm_intel_bo_reference(brw->draw.draw_params_bo); brw->draw.draw_params_offset = prims[i].indirect_offset + (prims[i].indexed ? 12 : 8); } else { /* Set draw_params_bo to NULL so brw_prepare_vertices knows it * has to upload gl_BaseVertex and such if they're needed. */ brw->draw.draw_params_bo = NULL; brw->draw.draw_params_offset = 0; } if (brw->gen < 6) brw_set_prim(brw, &prims[i]); else gen6_set_prim(brw, &prims[i]); retry: /* Note that before the loop, brw->ctx.NewDriverState was set to != 0, and * that the state updated in the loop outside of this block is that in * *_set_prim or intel_batchbuffer_flush(), which only impacts * brw->ctx.NewDriverState. */ if (brw->ctx.NewDriverState) { brw->no_batch_wrap = true; brw_upload_render_state(brw); } brw_emit_prim(brw, &prims[i], brw->primitive); brw->no_batch_wrap = false; if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) { if (!fail_next) { intel_batchbuffer_reset_to_saved(brw); intel_batchbuffer_flush(brw); fail_next = true; goto retry; } else { int ret = intel_batchbuffer_flush(brw); WARN_ONCE(ret == -ENOSPC, "i965: Single primitive emit exceeded " "available aperture space\n"); } } /* Now that we know we haven't run out of aperture space, we can safely * reset the dirty bits. */ if (brw->ctx.NewDriverState) brw_render_state_finished(brw); } if (brw->always_flush_batch) intel_batchbuffer_flush(brw); brw_state_cache_check_size(brw); brw_postdraw_set_buffers_need_resolve(brw); return; }
/* May fail if out of video memory for texture or vbo upload, or on * fallback conditions. */ static bool brw_try_draw_prims( struct gl_context *ctx, const struct gl_client_array *arrays[], const struct _mesa_prim *prims, GLuint nr_prims, const struct _mesa_index_buffer *ib, GLuint min_index, GLuint max_index, struct gl_buffer_object *indirect) { struct brw_context *brw = brw_context(ctx); bool retval = true; GLuint i; bool fail_next = false; if (ctx->NewState) _mesa_update_state( ctx ); /* Find the highest sampler unit used by each shader program. A bit-count * won't work since ARB programs use the texture unit number as the sampler * index. */ brw->wm.base.sampler_count = _mesa_fls(ctx->FragmentProgram._Current->Base.SamplersUsed); brw->gs.base.sampler_count = ctx->GeometryProgram._Current ? _mesa_fls(ctx->GeometryProgram._Current->Base.SamplersUsed) : 0; brw->vs.base.sampler_count = _mesa_fls(ctx->VertexProgram._Current->Base.SamplersUsed); /* We have to validate the textures *before* checking for fallbacks; * otherwise, the software fallback won't be able to rely on the * texture state, the firstLevel and lastLevel fields won't be * set in the intel texture object (they'll both be 0), and the * software fallback will segfault if it attempts to access any * texture level other than level 0. */ brw_validate_textures( brw ); intel_prepare_render(brw); /* This workaround has to happen outside of brw_upload_state() because it * may flush the batchbuffer for a blit, affecting the state flags. */ brw_workaround_depthstencil_alignment(brw, 0); /* Resolves must occur after updating renderbuffers, updating context state, * and finalizing textures but before setting up any hardware state for * this draw call. */ brw_predraw_resolve_buffers(brw); /* Bind all inputs, derive varying and size information: */ brw_merge_inputs( brw, arrays ); brw->ib.ib = ib; brw->state.dirty.brw |= BRW_NEW_INDICES; brw->vb.min_index = min_index; brw->vb.max_index = max_index; brw->state.dirty.brw |= BRW_NEW_VERTICES; for (i = 0; i < nr_prims; i++) { int estimated_max_prim_size; estimated_max_prim_size = 512; /* batchbuffer commands */ estimated_max_prim_size += (BRW_MAX_TEX_UNIT * (sizeof(struct brw_sampler_state) + sizeof(struct gen5_sampler_default_color))); estimated_max_prim_size += 1024; /* gen6 VS push constants */ estimated_max_prim_size += 1024; /* gen6 WM push constants */ estimated_max_prim_size += 512; /* misc. pad */ /* Flush the batch if it's approaching full, so that we don't wrap while * we've got validated state that needs to be in the same batch as the * primitives. */ intel_batchbuffer_require_space(brw, estimated_max_prim_size, RENDER_RING); intel_batchbuffer_save_state(brw); if (brw->num_instances != prims[i].num_instances) { brw->num_instances = prims[i].num_instances; brw->state.dirty.brw |= BRW_NEW_VERTICES; brw_merge_inputs(brw, arrays); } if (brw->basevertex != prims[i].basevertex) { brw->basevertex = prims[i].basevertex; brw->state.dirty.brw |= BRW_NEW_VERTICES; brw_merge_inputs(brw, arrays); } if (brw->gen < 6) brw_set_prim(brw, &prims[i]); else gen6_set_prim(brw, &prims[i]); retry: /* Note that before the loop, brw->state.dirty.brw was set to != 0, and * that the state updated in the loop outside of this block is that in * *_set_prim or intel_batchbuffer_flush(), which only impacts * brw->state.dirty.brw. */ if (brw->state.dirty.brw) { brw->no_batch_wrap = true; brw_upload_state(brw); } brw_emit_prim(brw, &prims[i], brw->primitive); brw->no_batch_wrap = false; if (dri_bufmgr_check_aperture_space(&brw->batch.bo, 1)) { if (!fail_next) { intel_batchbuffer_reset_to_saved(brw); intel_batchbuffer_flush(brw); fail_next = true; goto retry; } else { if (intel_batchbuffer_flush(brw) == -ENOSPC) { static bool warned = false; if (!warned) { fprintf(stderr, "i965: Single primitive emit exceeded" "available aperture space\n"); warned = true; } retval = false; } } } } if (brw->always_flush_batch) intel_batchbuffer_flush(brw); brw_state_cache_check_size(brw); brw_postdraw_set_buffers_need_resolve(brw); return retval; }
/** * Write out a batch of 32 control data bits from the control_data_bits * register to the URB. * * The current value of the vertex_count register determines which DWORD in * the URB receives the control data bits. The control_data_bits register is * assumed to contain the correct data for the vertex that was most recently * output, and all previous vertices that share the same DWORD. * * This function takes care of ensuring that if no vertices have been output * yet, no control bits are emitted. */ void vec4_gs_visitor::emit_control_data_bits() { assert(c->control_data_bits_per_vertex != 0); /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized) * granularity, we need to use two tricks to ensure that the batch of 32 * control data bits is written to the appropriate DWORD in the URB. To * select which vec4 we are writing to, we use the "slot {0,1} offset" * fields of the message header. To select which DWORD in the vec4 we are * writing to, we use the channel mask fields of the message header. To * avoid penalizing geometry shaders that emit a small number of vertices * with extra bookkeeping, we only do each of these tricks when * c->prog_data.control_data_header_size_bits is large enough to make it * necessary. * * Note: this means that if we're outputting just a single DWORD of control * data bits, we'll actually replicate it four times since we won't do any * channel masking. But that's not a problem since in this case the * hardware only pays attention to the first DWORD. */ enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD; if (c->control_data_header_size_bits > 32) urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS; if (c->control_data_header_size_bits > 128) urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET; /* If vertex_count is 0, then no control data bits have been accumulated * yet, so we should do nothing. */ emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ)); emit(IF(BRW_PREDICATE_NORMAL)); { /* If we are using either channel masks or a per-slot offset, then we * need to figure out which DWORD we are trying to write to, using the * formula: * * dword_index = (vertex_count - 1) * bits_per_vertex / 32 * * Since bits_per_vertex is a power of two, and is known at compile * time, this can be optimized to: * * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) */ src_reg dword_index(this, glsl_type::uint_type); if (urb_write_flags) { src_reg prev_count(this, glsl_type::uint_type); emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); unsigned log2_bits_per_vertex = _mesa_fls(c->control_data_bits_per_vertex); emit(SHR(dst_reg(dword_index), prev_count, (uint32_t) (6 - log2_bits_per_vertex))); } /* Start building the URB write message. The first MRF gets a copy of * R0. */ int base_mrf = 1; dst_reg mrf_reg(MRF, base_mrf); src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); vec4_instruction *inst = emit(MOV(mrf_reg, r0)); inst->force_writemask_all = true; if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { /* Set the per-slot offset to dword_index / 4, to that we'll write to * the appropriate OWORD within the control data header. */ src_reg per_slot_offset(this, glsl_type::uint_type); emit(SHR(dst_reg(per_slot_offset), dword_index, 2u)); emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); } if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { /* Set the channel masks to 1 << (dword_index % 4), so that we'll * write to the appropriate DWORD within the OWORD. We need to do * this computation with force_writemask_all, otherwise garbage data * from invocation 0 might clobber the mask for invocation 1 when * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks * together. */ src_reg channel(this, glsl_type::uint_type); inst = emit(AND(dst_reg(channel), dword_index, 3u)); inst->force_writemask_all = true; src_reg one(this, glsl_type::uint_type); inst = emit(MOV(dst_reg(one), 1u)); inst->force_writemask_all = true; src_reg channel_mask(this, glsl_type::uint_type); inst = emit(SHL(dst_reg(channel_mask), one, channel)); inst->force_writemask_all = true; emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), channel_mask); emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); } /* Store the control data bits in the message payload and send it. */ dst_reg mrf_reg2(MRF, base_mrf + 1); inst = emit(MOV(mrf_reg2, this->control_data_bits)); inst->force_writemask_all = true; inst = emit(GS_OPCODE_URB_WRITE); inst->urb_write_flags = urb_write_flags; /* We need to increment Global Offset by 256-bits to make room for * Broadwell's extra "Vertex Count" payload at the beginning of the * URB entry. Since this is an OWord message, Global Offset is counted * in 128-bit units, so we must set it to 2. */ if (brw->gen >= 8) inst->offset = 2; inst->base_mrf = base_mrf; inst->mlen = 2; } emit(BRW_OPCODE_ENDIF); }