void vec4_gs_visitor::gs_end_primitive() { /* We can only do EndPrimitive() functionality when the control data * consists of cut bits. Fortunately, the only time it isn't is when the * output type is points, in which case EndPrimitive() is a no-op. */ if (gs_prog_data->control_data_format != GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { return; } if (c->control_data_header_size_bits == 0) return; /* Cut bits use one bit per vertex. */ assert(c->control_data_bits_per_vertex == 1); /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting * vertex n, 0 otherwise. So all we need to do here is mark bit * (vertex_count - 1) % 32 in the cut_bits register to indicate that * EndPrimitive() was called after emitting vertex (vertex_count - 1); * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. * * Note that if EndPrimitve() is called before emitting any vertices, this * will cause us to set bit 31 of the control_data_bits register to 1. * That's fine because: * * - If max_vertices < 32, then vertex number 31 (zero-based) will never be * output, so the hardware will ignore cut bit 31. * * - If max_vertices == 32, then vertex number 31 is guaranteed to be the * last vertex, so setting cut bit 31 has no effect (since the primitive * is automatically ended when the GS terminates). * * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the * control_data_bits register to 0 when the first vertex is emitted. */ /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ src_reg one(this, glsl_type::uint_type); emit(MOV(dst_reg(one), brw_imm_ud(1u))); src_reg prev_count(this, glsl_type::uint_type); emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu))); src_reg mask(this, glsl_type::uint_type); /* Note: we're relying on the fact that the GEN SHL instruction only pays * attention to the lower 5 bits of its second source argument, so on this * architecture, 1 << (vertex_count - 1) is equivalent to 1 << * ((vertex_count - 1) % 32). */ emit(SHL(dst_reg(mask), one, prev_count)); emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); }
/** * Write out a batch of 32 control data bits from the control_data_bits * register to the URB. * * The current value of the vertex_count register determines which DWORD in * the URB receives the control data bits. The control_data_bits register is * assumed to contain the correct data for the vertex that was most recently * output, and all previous vertices that share the same DWORD. * * This function takes care of ensuring that if no vertices have been output * yet, no control bits are emitted. */ void vec4_gs_visitor::emit_control_data_bits() { assert(c->control_data_bits_per_vertex != 0); /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized) * granularity, we need to use two tricks to ensure that the batch of 32 * control data bits is written to the appropriate DWORD in the URB. To * select which vec4 we are writing to, we use the "slot {0,1} offset" * fields of the message header. To select which DWORD in the vec4 we are * writing to, we use the channel mask fields of the message header. To * avoid penalizing geometry shaders that emit a small number of vertices * with extra bookkeeping, we only do each of these tricks when * c->prog_data.control_data_header_size_bits is large enough to make it * necessary. * * Note: this means that if we're outputting just a single DWORD of control * data bits, we'll actually replicate it four times since we won't do any * channel masking. But that's not a problem since in this case the * hardware only pays attention to the first DWORD. */ enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD; if (c->control_data_header_size_bits > 32) urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS; if (c->control_data_header_size_bits > 128) urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET; /* If vertex_count is 0, then no control data bits have been accumulated * yet, so we should do nothing. */ emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ)); emit(IF(BRW_PREDICATE_NORMAL)); { /* If we are using either channel masks or a per-slot offset, then we * need to figure out which DWORD we are trying to write to, using the * formula: * * dword_index = (vertex_count - 1) * bits_per_vertex / 32 * * Since bits_per_vertex is a power of two, and is known at compile * time, this can be optimized to: * * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) */ src_reg dword_index(this, glsl_type::uint_type); if (urb_write_flags) { src_reg prev_count(this, glsl_type::uint_type); emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); unsigned log2_bits_per_vertex = _mesa_fls(c->control_data_bits_per_vertex); emit(SHR(dst_reg(dword_index), prev_count, (uint32_t) (6 - log2_bits_per_vertex))); } /* Start building the URB write message. The first MRF gets a copy of * R0. */ int base_mrf = 1; dst_reg mrf_reg(MRF, base_mrf); src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); vec4_instruction *inst = emit(MOV(mrf_reg, r0)); inst->force_writemask_all = true; if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { /* Set the per-slot offset to dword_index / 4, to that we'll write to * the appropriate OWORD within the control data header. */ src_reg per_slot_offset(this, glsl_type::uint_type); emit(SHR(dst_reg(per_slot_offset), dword_index, 2u)); emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); } if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { /* Set the channel masks to 1 << (dword_index % 4), so that we'll * write to the appropriate DWORD within the OWORD. We need to do * this computation with force_writemask_all, otherwise garbage data * from invocation 0 might clobber the mask for invocation 1 when * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks * together. */ src_reg channel(this, glsl_type::uint_type); inst = emit(AND(dst_reg(channel), dword_index, 3u)); inst->force_writemask_all = true; src_reg one(this, glsl_type::uint_type); inst = emit(MOV(dst_reg(one), 1u)); inst->force_writemask_all = true; src_reg channel_mask(this, glsl_type::uint_type); inst = emit(SHL(dst_reg(channel_mask), one, channel)); inst->force_writemask_all = true; emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), channel_mask); emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); } /* Store the control data bits in the message payload and send it. */ dst_reg mrf_reg2(MRF, base_mrf + 1); inst = emit(MOV(mrf_reg2, this->control_data_bits)); inst->force_writemask_all = true; inst = emit(GS_OPCODE_URB_WRITE); inst->urb_write_flags = urb_write_flags; /* We need to increment Global Offset by 256-bits to make room for * Broadwell's extra "Vertex Count" payload at the beginning of the * URB entry. Since this is an OWord message, Global Offset is counted * in 128-bit units, so we must set it to 2. */ if (brw->gen >= 8) inst->offset = 2; inst->base_mrf = base_mrf; inst->mlen = 2; } emit(BRW_OPCODE_ENDIF); }