void
gen6_gs_visitor::visit(ir_end_primitive *)
{
   this->current_annotation = "gen6 end primitive";
   /* Calling EndPrimitive() is optional for point output. In this case we set
    * the PrimEnd flag when we process EmitVertex().
    */
   if (c->gp->program.OutputType == GL_POINTS)
      return;

   /* Otherwise we know that the last vertex we have processed was the last
    * vertex in the primitive and we need to set its PrimEnd flag, so do this
    * unless we haven't emitted that vertex at all (vertex_count != 0).
    *
    * Notice that we have already incremented vertex_count when we processed
    * the last emit_vertex, so we need to take that into account in the
    * comparison below (hence the num_output_vertices + 1 in the comparison
    * below).
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices + 1), BRW_CONDITIONAL_L));
   vec4_instruction *inst = emit(CMP(dst_null_d(),
                                     this->vertex_count, 0u,
                                     BRW_CONDITIONAL_NEQ));
   inst->predicate = BRW_PREDICATE_NORMAL;
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex. So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));

      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));
      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));

      /* Set the first vertex flag to indicate that the next vertex will
       * start a primitive.
       */
      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
   }
   emit(BRW_OPCODE_ENDIF);
}
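/* For illustration (hypothetical values, not from the original source): with
 * VerticesOut == 3, an EndPrimitive() issued right after the third
 * EmitVertex() sees vertex_count == 3.  The first CMP checks 3 < 3 + 1 and
 * the predicated second CMP checks 3 != 0, so the IF block runs and PrimEnd
 * is OR'ed into the flags of the previously buffered vertex.  If the shader
 * calls EndPrimitive() before emitting any vertex, vertex_count == 0 fails
 * the second test and the flags write is skipped, as intended.
 */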
void
vec4_tcs_visitor::emit_thread_end()
{
   vec4_instruction *inst;
   current_annotation = "thread end";

   if (nir->info->tcs.vertices_out % 2) {
      emit(BRW_OPCODE_ENDIF);
   }

   if (devinfo->gen == 7) {
      struct brw_tcs_prog_data *tcs_prog_data =
         (struct brw_tcs_prog_data *) prog_data;

      current_annotation = "release input vertices";

      /* Synchronize all threads, so we know that no one is still
       * using the input URB handles.
       */
      if (tcs_prog_data->instances > 1) {
         dst_reg header = dst_reg(this, glsl_type::uvec4_type);
         emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
         emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      }

      /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
       * We want to compare the bottom half of invocation_id with 0, but
       * use that truth value for the top half as well.  Unfortunately,
       * we don't have stride in the vec4 world, nor UV immediates in
       * align16, so we need an opcode to get invocation_id<0,4,0>.
       */
      set_condmod(BRW_CONDITIONAL_Z,
                  emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
                       invocation_id));
      emit(IF(BRW_PREDICATE_NORMAL));
      for (unsigned i = 0; i < key->input_vertices; i += 2) {
         /* If we have an odd number of input vertices, the last will be
          * unpaired.  We don't want to use an interleaved URB write in
          * that case.
          */
         const bool is_unpaired = i == key->input_vertices - 1;

         dst_reg header(this, glsl_type::uvec4_type);
         emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
              brw_imm_ud(is_unpaired));
      }
      emit(BRW_OPCODE_ENDIF);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
      emit_shader_time_end();

   inst = emit(TCS_OPCODE_THREAD_END);
   inst->base_mrf = 14;
   inst->mlen = 2;
}
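/* For illustration (hypothetical input count, not from the original source):
 * with key->input_vertices == 5 the release loop above runs for i == 0, 2
 * and 4.  The first two iterations release the ICP handle pairs (0,1) and
 * (2,3) with is_unpaired == false, while the last iteration has
 * i == input_vertices - 1, so handle 4 is released on its own with
 * is_unpaired == true and no interleaved URB write is used for it.
 */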
void
vec4_tcs_visitor::emit_prolog()
{
   invocation_id = src_reg(this, glsl_type::uint_type);
   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));

   /* HS threads are dispatched with the dispatch mask set to 0xFF.
    * If there are an odd number of output vertices, then the final
    * HS instance dispatched will only have its bottom half doing real
    * work, and so we need to disable the upper half:
    */
   if (nir->info->tcs.vertices_out % 2) {
      emit(CMP(dst_null_d(), invocation_id,
               brw_imm_ud(nir->info->tcs.vertices_out),
               BRW_CONDITIONAL_L));

      /* Matching ENDIF is in emit_thread_end() */
      emit(IF(BRW_PREDICATE_NORMAL));
   }
}
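/* For illustration (hypothetical vertex count, not from the original source):
 * with tcs.vertices_out == 3, the hardware still dispatches full instances of
 * paired invocations <0,1> and <2,3>.  The CMP above computes
 * invocation_id < 3, which is false for invocation 3, so the IF opened here
 * disables the upper half of the last instance until the matching ENDIF in
 * emit_thread_end().
 */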
void
vec4_gs_visitor::visit(ir_emit_vertex *ir)
{
   this->current_annotation = "emit vertex: safety check";

   /* To ensure that we don't output more vertices than the shader specified
    * using max_vertices, do the logic inside a conditional of the form "if
    * (vertex_count < MAX)"
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we're outputting 32 control data bits or less, then we can wait
       * until the shader is over to output them all.  Otherwise we need to
       * output them as we go.  Now is the time to do it, since we're about
       * to output the vertex_count'th vertex, so it's guaranteed that the
       * control data bits associated with the (vertex_count - 1)th vertex
       * are correct.
       */
      if (c->control_data_header_size_bits > 32) {
         this->current_annotation = "emit vertex: emit control data bits";
         /* Only emit control data bits if we've finished accumulating a
          * batch of 32 bits.  This is the case when:
          *
          *     (vertex_count * bits_per_vertex) % 32 == 0
          *
          * (in other words, when the last 5 bits of vertex_count *
          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
          * integer n (which is always the case, since bits_per_vertex is
          * always 1 or 2), this is equivalent to requiring that the last 5-n
          * bits of vertex_count are 0:
          *
          *     vertex_count & (2^(5-n) - 1) == 0
          *
          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
          * equivalent to:
          *
          *     vertex_count & (32 / bits_per_vertex - 1) == 0
          */
         vec4_instruction *inst =
            emit(AND(dst_null_d(), this->vertex_count,
                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         emit(IF(BRW_PREDICATE_NORMAL));
         {
            emit_control_data_bits();

            /* Reset control_data_bits to 0 so we can start accumulating a
             * new batch.
             *
             * Note: in the case where vertex_count == 0, this neutralizes
             * the effect of any call to EndPrimitive() that the shader may
             * have made before outputting its first vertex.
             */
            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
            inst->force_writemask_all = true;
         }
         emit(BRW_OPCODE_ENDIF);
      }

      this->current_annotation = "emit vertex: vertex data";
      emit_vertex();

      /* In stream mode we have to set control data bits for all vertices
       * unless we have disabled control data bits completely (which we do
       * for GL_POINTS outputs that don't use streams).
       */
      if (c->control_data_header_size_bits > 0 &&
          c->prog_data.control_data_format ==
             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
         this->current_annotation = "emit vertex: Stream control data bits";
         set_stream_control_data_bits(ir->stream_id());
      }

      this->current_annotation = "emit vertex: increment vertex count";
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
               src_reg(1u)));
   }
   emit(BRW_OPCODE_ENDIF);

   this->current_annotation = NULL;
}
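/* For illustration of the flush condition above (hypothetical stream setup,
 * not from the original source): if the shader uses streams,
 * control_data_bits_per_vertex == 2, so the AND tests
 * vertex_count & (32 / 2 - 1) == vertex_count & 15, and the accumulated
 * control data is flushed on every 16th vertex (16 * 2 == 32 bits).  With
 * bits_per_vertex == 1 the mask is 31 and the flush happens every 32nd
 * vertex instead.
 */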
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits.  The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void
vec4_gs_visitor::emit_control_data_bits()
{
   assert(c->control_data_bits_per_vertex != 0);

   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
    * granularity, we need to use two tricks to ensure that the batch of 32
    * control data bits is written to the appropriate DWORD in the URB.  To
    * select which vec4 we are writing to, we use the "slot {0,1} offset"
    * fields of the message header.  To select which DWORD in the vec4 we are
    * writing to, we use the channel mask fields of the message header.  To
    * avoid penalizing geometry shaders that emit a small number of vertices
    * with extra bookkeeping, we only do each of these tricks when
    * c->prog_data.control_data_header_size_bits is large enough to make it
    * necessary.
    *
    * Note: this means that if we're outputting just a single DWORD of control
    * data bits, we'll actually replicate it four times since we won't do any
    * channel masking.  But that's not a problem since in this case the
    * hardware only pays attention to the first DWORD.
    */
   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
   if (c->control_data_header_size_bits > 32)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
   if (c->control_data_header_size_bits > 128)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;

   /* If vertex_count is 0, then no control data bits have been accumulated
    * yet, so we should do nothing.
    */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we are using either channel masks or a per-slot offset, then we
       * need to figure out which DWORD we are trying to write to, using the
       * formula:
       *
       *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
       *
       * Since bits_per_vertex is a power of two, and is known at compile
       * time, this can be optimized to:
       *
       *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
       */
      src_reg dword_index(this, glsl_type::uint_type);
      if (urb_write_flags) {
         src_reg prev_count(this, glsl_type::uint_type);
         emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu));
         unsigned log2_bits_per_vertex =
            _mesa_fls(c->control_data_bits_per_vertex);
         emit(SHR(dst_reg(dword_index), prev_count,
                  (uint32_t) (6 - log2_bits_per_vertex)));
      }

      /* Start building the URB write message.  The first MRF gets a copy of
       * R0.
       */
      int base_mrf = 1;
      dst_reg mrf_reg(MRF, base_mrf);
      src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      vec4_instruction *inst = emit(MOV(mrf_reg, r0));
      inst->force_writemask_all = true;

      if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
         /* Set the per-slot offset to dword_index / 4, so that we'll write
          * to the appropriate OWORD within the control data header.
          */
         src_reg per_slot_offset(this, glsl_type::uint_type);
         emit(SHR(dst_reg(per_slot_offset), dword_index, 2u));
         emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u);
      }

      if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
         /* Set the channel masks to 1 << (dword_index % 4), so that we'll
          * write to the appropriate DWORD within the OWORD.  We need to do
          * this computation with force_writemask_all, otherwise garbage data
          * from invocation 0 might clobber the mask for invocation 1 when
          * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
          * together.
          */
         src_reg channel(this, glsl_type::uint_type);
         inst = emit(AND(dst_reg(channel), dword_index, 3u));
         inst->force_writemask_all = true;
         src_reg one(this, glsl_type::uint_type);
         inst = emit(MOV(dst_reg(one), 1u));
         inst->force_writemask_all = true;
         src_reg channel_mask(this, glsl_type::uint_type);
         inst = emit(SHL(dst_reg(channel_mask), one, channel));
         inst->force_writemask_all = true;
         emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
              channel_mask);
         emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
      }

      /* Store the control data bits in the message payload and send it. */
      dst_reg mrf_reg2(MRF, base_mrf + 1);
      inst = emit(MOV(mrf_reg2, this->control_data_bits));
      inst->force_writemask_all = true;
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = urb_write_flags;
      /* We need to increment Global Offset by 256-bits to make room for
       * Broadwell's extra "Vertex Count" payload at the beginning of the
       * URB entry.  Since this is an OWord message, Global Offset is counted
       * in 128-bit units, so we must set it to 2.
       */
      if (brw->gen >= 8)
         inst->offset = 2;
      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   emit(BRW_OPCODE_ENDIF);
}
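/* Worked example for the addressing math above (hypothetical numbers, not
 * from the original source): with control_data_bits_per_vertex == 1 and
 * vertex_count == 100, the most recently emitted vertex is number 99 and
 * dword_index = 99 * 1 / 32 = 3.  _mesa_fls(1) == 1, so the SHR uses
 * 6 - 1 == 5 and computes 99 >> 5 == 3.  The per-slot offset then becomes
 * 3 >> 2 == 0 (first OWORD) and the channel mask becomes
 * 1 << (3 & 3) == 0x8, selecting the fourth DWORD of that OWORD.
 */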
void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;
   unsigned binding;
   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices).  Otherwise, avoid writing any vertices for it.
    */
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *    ensure that all writes are complete by sending the final
          *    write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
          * same slot, so make sure we write the appropriate channel
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* This is the last vertex of the primitive, so increment the
             * SO primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }
      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}
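/* For illustration of the overflow check above (hypothetical numbers, not
 * from the original source): with num_verts == 3, svbi == 12 and
 * sol_prim_written == 2, sol_temp becomes (2 + 1) * 3 + 12 == 21.  The
 * primitive's three vertices are only streamed out if max_svbi >= 21;
 * otherwise the whole primitive is dropped rather than partially written.
 */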
void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;

   if (!prog_data->num_transform_feedback_bindings)
      return;

   switch (c->prog_data.output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}
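/* For illustration (hypothetical numbers, not from the original source):
 * with a triangle topology num_verts == 3, so the initial check above
 * requires svbi + 3 <= max_svbi before any primitive is written.  If
 * svbi == 5 and the check passes, destination_indices is initialized to
 * (0, 1, 2) and then offset by svbi to (5, 6, 7), the slots the first
 * streamed-out triangle will occupy.
 */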
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero.  This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (c->gp->program.OutputType != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         visit((ir_end_primitive *) NULL);
      }
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (c->prog_data.gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count,
                  BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is
             * half of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE.
                */
               if (mrf > max_usable_mrf) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (c->prog_data.gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we
    * have emitted at least one vertex or not.  In case we did, the EOT
    * message must always include the COMPLETE flag or else the GPU hangs.
    * If we have not produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so what we do is that we always request a new
    * VUE handle every time we do a URB WRITE, even for the last vertex we
    * emit.  With this we make sure that whether we have emitted at least one
    * vertex or none at all, we have to finish the thread without writing to
    * the URB, which works for both cases by setting the COMPLETE and UNUSED
    * flags in the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (c->prog_data.gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
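/* For illustration of the URB write batching above (hypothetical VUE map
 * size, not from the original source): with base_mrf == 1 the vertex data
 * occupies MRFs 2..13, i.e. at most 12 slots per URB write.  For a VUE map
 * with 18 slots, the first iteration writes slots 0-11 at urb_offset 0 and
 * leaves complete == false; the second iteration writes slots 12-17 at
 * urb_offset == 12 / 2 == 6 and terminates the loop.
 */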
void
gen6_gs_visitor::visit(ir_emit_vertex *)
{
   this->current_annotation = "gen6 emit vertex";

   /* Honor max_vertices layout indication in geometry shader by ignoring
    * any vertices coming after c->gp->program.VerticesOut.
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Buffer all output slots for this vertex in vertex_output */
      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
         int varying = prog_data->vue_map.slot_to_varying[slot];
         if (varying != VARYING_SLOT_PSIZ) {
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            emit_urb_slot(dst, varying);
         } else {
            /* The PSIZ slot can pack multiple varyings in different channels
             * and emit_urb_slot() will produce a MOV instruction for each of
             * them.  Since we are writing to an array, that will translate to
             * possibly multiple MOV instructions with an array destination
             * and each will generate a scratch write with the same offset
             * into scratch space (thus, each one overwriting the previous).
             * This is not what we want.  What we will do instead is emit
             * PSIZ to a regular temporary register, then move that register
             * into the array.  This way we only have one instruction with an
             * array destination and we only produce a single scratch write.
             */
            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
            emit_urb_slot(tmp, varying);
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
            inst->force_writemask_all = true;
         }

         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));
      }

      /* Now buffer flags for this vertex */
      dst_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
      if (c->gp->program.OutputType == GL_POINTS) {
         /* If we are outputting points, then every vertex has PrimStart and
          * PrimEnd set.
          */
         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
      } else {
         /* Otherwise, we can only set the PrimStart flag, which we have
          * stored in the first_vertex register.  We will have to wait until
          * we execute EndPrimitive() or we end the thread to set the PrimEnd
          * flag on a vertex.
          */
         emit(OR(dst, this->first_vertex,
                 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
         emit(MOV(dst_reg(this->first_vertex), 0u));
      }
      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));

      /* Update vertex count */
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
   }
   emit(BRW_OPCODE_ENDIF);
}