void
vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   const struct brw_tes_prog_data *tes_prog_data =
      (const struct brw_tes_prog_data *) prog_data;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
               src_reg(brw_vec8_grf(1, 0))));
      break;
   case nir_intrinsic_load_tess_level_outer:
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_ZWZW)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      }
      break;
   case nir_intrinsic_load_tess_level_inner:
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  src_reg(ATTR, 1, glsl_type::float_type)));
      }
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TES_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      src_reg header = input_read_header;
      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
      unsigned first_component = nir_intrinsic_component(instr);
      if (is_64bit)
         first_component /= 2;

      if (indirect_offset.file != BAD_FILE) {
         header = src_reg(this, glsl_type::uvec4_type);
         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
              input_read_header, indirect_offset);
      } else {
         /* Arbitrarily only push up to 24 vec4 slots worth of data,
          * which is 12 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
            const glsl_type *src_glsl_type =
               is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
            src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

            const brw_reg_type dst_reg_type =
               is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
            emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));

            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
            break;
         }
      }

      if (!is_64bit) {
         dst_reg temp(this, glsl_type::ivec4_type);
         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         src_reg src = src_reg(temp);
         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Copy to target.  We might end up with some funky writemasks
          * landing in here, but we really don't want them in the above
          * pseudo-ops.
          */
         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src));
      } else {
         /* For 64-bit we need to load twice as many 32-bit components, and
          * for dvec3/4 we need to emit 2 URB Read messages.
          */
         dst_reg temp(this, glsl_type::dvec4_type);
         dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);

         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         if (instr->num_components > 2) {
            read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
                        src_reg(header));
            read->offset = imm_offset + 1;
            read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
         }

         src_reg temp_as_src = src_reg(temp);
         temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         dst_reg shuffled(this, glsl_type::dvec4_type);
         shuffle_64bit_data(shuffled, temp_as_src, false);

         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src_reg(shuffled)));
      }
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
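/* A minimal standalone sketch (not driver code) of how the 2-bits-per-channel
 * swizzle encoding used above remaps components, assuming the usual
 * BRW_SWIZZLE4(a,b,c,d) == a | b<<2 | c<<4 | d<<6 layout.  It illustrates why
 * BRW_SWIZZLE_WZYX recovers a vec4 that was stored in reverse order, as the
 * tess level reads above rely on.
 */
#include <assert.h>

#define SWIZZLE4(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
#define GET_SWZ(swz, idx)    (((swz) >> ((idx) * 2)) & 0x3)

static void
apply_swizzle(const float src[4], unsigned swz, float dst[4])
{
   for (int i = 0; i < 4; i++)
      dst[i] = src[GET_SWZ(swz, i)];   /* channel i reads src channel swz[i] */
}

static void
swizzle_demo(void)
{
   /* gl_TessLevelOuter stored reversed in the patch header: w,z,y,x order. */
   const float stored[4] = { 4.0f, 3.0f, 2.0f, 1.0f };
   float out[4];

   apply_swizzle(stored, SWIZZLE4(3, 2, 1, 0), out);   /* WZYX */
   assert(out[0] == 1.0f && out[3] == 4.0f);           /* back in API order */
}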
void
vec4_gs_visitor::visit(ir_emit_vertex *ir)
{
   this->current_annotation = "emit vertex: safety check";

   /* To ensure that we don't output more vertices than the shader specified
    * using max_vertices, do the logic inside a conditional of the form
    * "if (vertex_count < MAX)".
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we're outputting 32 control data bits or less, then we can wait
       * until the shader is over to output them all.  Otherwise we need to
       * output them as we go.  Now is the time to do it, since we're about
       * to output the vertex_count'th vertex, so it's guaranteed that the
       * control data bits associated with the (vertex_count - 1)th vertex
       * are correct.
       */
      if (c->control_data_header_size_bits > 32) {
         this->current_annotation = "emit vertex: emit control data bits";
         /* Only emit control data bits if we've finished accumulating a
          * batch of 32 bits.  This is the case when:
          *
          *     (vertex_count * bits_per_vertex) % 32 == 0
          *
          * (in other words, when the last 5 bits of vertex_count *
          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for
          * some integer n (which is always the case, since bits_per_vertex
          * is always 1 or 2), this is equivalent to requiring that the
          * last 5-n bits of vertex_count are 0:
          *
          *     vertex_count & (2^(5-n) - 1) == 0
          *
          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
          * equivalent to:
          *
          *     vertex_count & (32 / bits_per_vertex - 1) == 0
          */
         vec4_instruction *inst =
            emit(AND(dst_null_d(), this->vertex_count,
                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         emit(IF(BRW_PREDICATE_NORMAL));
         {
            emit_control_data_bits();

            /* Reset control_data_bits to 0 so we can start accumulating a
             * new batch.
             *
             * Note: in the case where vertex_count == 0, this neutralizes
             * the effect of any call to EndPrimitive() that the shader may
             * have made before outputting its first vertex.
             */
            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
            inst->force_writemask_all = true;
         }
         emit(BRW_OPCODE_ENDIF);
      }

      this->current_annotation = "emit vertex: vertex data";
      emit_vertex();

      /* In stream mode we have to set control data bits for all vertices
       * unless we have disabled control data bits completely (which we do
       * for GL_POINTS outputs that don't use streams).
       */
      if (c->control_data_header_size_bits > 0 &&
          c->prog_data.control_data_format ==
             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
         this->current_annotation = "emit vertex: Stream control data bits";
         set_stream_control_data_bits(ir->stream_id());
      }

      this->current_annotation = "emit vertex: increment vertex count";
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
               src_reg(1u)));
   }
   emit(BRW_OPCODE_ENDIF);

   this->current_annotation = NULL;
}
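/* A small standalone check (not driver code) of the bit trick derived in the
 * comment above: for bits_per_vertex in {1, 2},
 * (vertex_count * bits_per_vertex) % 32 == 0 is equivalent to
 * vertex_count & (32 / bits_per_vertex - 1) == 0, which is exactly what the
 * emitted AND tests.
 */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

static void
control_data_batch_check(void)
{
   static const uint32_t bits_per_vertex[] = { 1, 2 };

   for (int i = 0; i < 2; i++) {
      const uint32_t bpv = bits_per_vertex[i];
      for (uint32_t vertex_count = 0; vertex_count < 1024; vertex_count++) {
         const bool by_modulus = (vertex_count * bpv) % 32 == 0;
         const bool by_mask    = (vertex_count & (32 / bpv - 1)) == 0;
         assert(by_modulus == by_mask);
      }
   }
}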
void
vec4_gs_visitor::emit_prolog()
{
   /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
    * geometry shaders, it isn't (it contains a bunch of information we
    * don't need, like the input primitive type).  We need r0.2 to be zero
    * in order to build scratch read/write messages correctly (otherwise
    * this value will be interpreted as a global offset, causing us to do
    * our scratch reads/writes to garbage memory).  So just set it to zero
    * at the top of the shader.
    */
   this->current_annotation = "clear r0.2";
   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2_IMMED, r0, 0u);
   inst->force_writemask_all = true;

   /* Create a virtual register to hold the vertex count */
   this->vertex_count = src_reg(this, glsl_type::uint_type);

   /* Initialize the vertex_count register to 0 */
   this->current_annotation = "initialize vertex_count";
   inst = emit(MOV(dst_reg(this->vertex_count), 0u));
   inst->force_writemask_all = true;

   if (c->control_data_header_size_bits > 0) {
      /* Create a virtual register to hold the current set of control data
       * bits.
       */
      this->control_data_bits = src_reg(this, glsl_type::uint_type);

      /* If we're outputting more than 32 control data bits, then
       * EmitVertex() will set control_data_bits to 0 after emitting the
       * first vertex.  Otherwise, we need to initialize it to 0 here.
       */
      if (c->control_data_header_size_bits <= 32) {
         this->current_annotation = "initialize control data bits";
         inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
         inst->force_writemask_all = true;
      }
   }

   /* If the geometry shader uses the gl_PointSize input, we need to fix it
    * up to account for the fact that the vertex shader stored it in the w
    * component of VARYING_SLOT_PSIZ.
    */
   if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) {
      this->current_annotation = "swizzle gl_PointSize input";
      for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) {
         dst_reg dst(ATTR,
                     BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ);
         dst.type = BRW_REGISTER_TYPE_F;
         src_reg src(dst);
         dst.writemask = WRITEMASK_X;
         src.swizzle = BRW_SWIZZLE_WWWW;
         inst = emit(MOV(dst, src));

         /* In dual instanced dispatch mode, dst has a width of 4, so we
          * need to make sure the MOV happens regardless of which channels
          * are enabled.
          */
         inst->force_writemask_all = true;
      }
   }

   this->current_annotation = NULL;
}
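/* A standalone scalar sketch (not driver code) of the gl_PointSize fix-up
 * above: a MOV with a WWWW source swizzle and an X destination writemask
 * copies the point size from the .w channel, where the VS stored it, into
 * .x, leaving the other channels untouched.
 */
static void
psiz_fixup(float attr[4])
{
   /* dst.writemask = X, src.swizzle = WWWW */
   attr[0] = attr[3];
}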
void
vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   dst_reg dest;
   src_reg src;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_per_vertex_input: {
      /* The EmitNoIndirectInput flag guarantees our vertex index will
       * be constant.  We should handle indirects someday.
       */
      nir_const_value *vertex = nir_src_as_const_value(instr->src[0]);
      nir_const_value *offset = nir_src_as_const_value(instr->src[1]);

      /* Make up a type...we have no way of knowing... */
      const glsl_type *const type = glsl_type::ivec(instr->num_components);

      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u[0] +
                          instr->const_index[0] + offset->u[0],
                    type);
      dest = get_nir_dest(instr->dest, src.type);
      dest.writemask = brw_writemask_for_size(instr->num_components);
      emit(MOV(dest, src));
      break;
   }

   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should have produced per_vertex intrinsics");

   case nir_intrinsic_emit_vertex_with_counter: {
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      int stream_id = instr->const_index[0];
      gs_emit_vertex(stream_id);
      break;
   }

   case nir_intrinsic_end_primitive_with_counter:
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      gs_end_primitive();
      break;

   case nir_intrinsic_set_vertex_count:
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      break;

   case nir_intrinsic_load_primitive_id:
      assert(gs_prog_data->include_primitive_id);
      dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
      break;

   case nir_intrinsic_load_invocation_id: {
      src_reg invocation_id =
         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
      assert(invocation_id.file != BAD_FILE);
      dest = get_nir_dest(instr->dest, invocation_id.type);
      emit(MOV(dest, invocation_id));
      break;
   }

   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
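/* A standalone sketch (not driver code) of the ATTR indexing used above:
 * per-vertex inputs are laid out as one block of BRW_VARYING_SLOT_COUNT slots
 * per input vertex, so a (vertex, slot) pair flattens to a single register
 * index.  The constant 34 is only a stand-in for BRW_VARYING_SLOT_COUNT.
 */
static unsigned
attr_index(unsigned vertex, unsigned base_slot, unsigned const_offset)
{
   const unsigned varying_slot_count = 34;   /* stand-in value */
   return varying_slot_count * vertex + base_slot + const_offset;
}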
void
vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_invocation_id:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
               invocation_id));
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;
   case nir_intrinsic_load_patch_vertices_in:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
               brw_imm_d(key->input_vertices)));
      break;
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
      src_reg vertex_index =
         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      emit_input_urb_read(dst, vertex_index, imm_offset,
                          nir_intrinsic_component(instr), indirect_offset);
      break;
   }
   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
      break;
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;

         /* This is a read of gl_TessLevelInner[], which lives in the
          * Patch URB header.  The layout depends on the domain.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS: {
            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
            dst_reg tmp(this, glsl_type::vec4_type);
            emit_output_urb_read(tmp, 0, 0, src_reg());
            emit(MOV(writemask(dst, WRITEMASK_XY),
                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
            break;
         }
         case GL_TRIANGLES:
            /* DWord 4; use offset 1 but normal swizzle/writemask. */
            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, 0,
                                 src_reg());
            break;
         case GL_ISOLINES:
            /* All channels are undefined. */
            return;
         default:
            unreachable("Bogus tessellation domain");
         }
      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;

         unsigned swiz = BRW_SWIZZLE_WZYX;

         /* This is a read of gl_TessLevelOuter[], which lives in the
          * high 4 DWords of the Patch URB header, in reverse order.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS:
            dst.writemask = WRITEMASK_XYZW;
            break;
         case GL_TRIANGLES:
            dst.writemask = WRITEMASK_XYZ;
            break;
         case GL_ISOLINES:
            /* Isolines are not reversed; swizzle .zw -> .xy */
            swiz = BRW_SWIZZLE_ZWZW;
            dst.writemask = WRITEMASK_XY;
            break;
         default:
            unreachable("Bogus tessellation domain");
         }

         dst_reg tmp(this, glsl_type::vec4_type);
         emit_output_urb_read(tmp, 1, 0, src_reg());
         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
      } else {
         emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
                              indirect_offset);
      }
      break;
   }
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output: {
      src_reg value = get_nir_src(instr->src[0]);
      unsigned mask = instr->const_index[1];
      unsigned swiz = BRW_SWIZZLE_XYZW;

      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      /* The passthrough shader writes the whole patch header as two vec4s;
       * skip all the gl_TessLevelInner/Outer swizzling below.
       */
      if (indirect_offset.file == BAD_FILE && !is_passthrough_shader) {
         if (imm_offset == 0) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelInner[], which lives in the
             * Patch URB header.  The layout depends on the domain.
             */
            switch (key->tes_primitive_mode) {
            case GL_QUADS:
               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
                * We use an XXYX swizzle to put .xy into the .wz channels,
                * reversed, and use a .zw writemask.
                */
               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
               mask = writemask_for_backwards_vector(mask);
               break;
            case GL_TRIANGLES:
               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
                * writemask to X and bump the URB offset by 1.
                */
               imm_offset = 1;
               break;
            case GL_ISOLINES:
               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
               return;
            default:
               unreachable("Bogus tessellation domain");
            }
         } else if (imm_offset == 1) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelOuter[] which lives in the
             * Patch URB Header at DWords 4-7.  However, it's reversed, so
             * instead of .xyzw we have .wzyx.
             */
            if (key->tes_primitive_mode == GL_ISOLINES) {
               /* Isolines .xy should be stored in .zw, in order. */
               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
               mask <<= 2;
            } else {
               /* Other domains are reversed; store .wzyx instead of
                * .xyzw.
                */
               swiz = BRW_SWIZZLE_WZYX;
               mask = writemask_for_backwards_vector(mask);
            }
         }
      }

      unsigned first_component = nir_intrinsic_component(instr);
      if (first_component) {
         assert(swiz == BRW_SWIZZLE_XYZW);
         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
         mask = mask << first_component;
      }

      emit_urb_write(swizzle(value, swiz), mask, imm_offset,
                     indirect_offset);
      break;
   }
   case nir_intrinsic_barrier: {
      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
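/* A standalone sketch (an assumption, not the verified driver helper) of what
 * writemask_for_backwards_vector() used above plausibly does: mirror the four
 * writemask bits so a write of an API-order vector becomes the corresponding
 * write of the reversed .wzyx storage order in the patch header.
 */
#include <assert.h>

static unsigned
backwards_writemask_sketch(unsigned mask)
{
   unsigned new_mask = 0;
   for (int i = 0; i < 4; i++)
      new_mask |= ((mask >> i) & 1) << (3 - i);   /* bit i -> bit 3-i */
   return new_mask;
}

static void
backwards_writemask_demo(void)
{
   assert(backwards_writemask_sketch(0x3) == 0xc);   /* .xy  -> .zw  */
   assert(backwards_writemask_sketch(0x7) == 0xe);   /* .xyz -> .yzw */
   assert(backwards_writemask_sketch(0xf) == 0xf);   /* .xyzw is symmetric */
}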
static struct prog_src_register src_undef( void )
{
   return src_reg(PROGRAM_UNDEFINED, 0);
}
static void emit_interp( struct brw_wm_compile *c,
                         GLuint idx )
{
   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
   struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
   struct prog_src_register deltas = get_delta_xy(c);

   /* Need to use PINTERP on attributes which have been
    * multiplied by 1/W in the SF program, and LINTERP on those
    * which have not:
    */
   switch (idx) {
   case FRAG_ATTRIB_WPOS:
      /* Have to treat wpos.xy specially:
       */
      emit_op(c,
              WM_WPOSXY,
              dst_mask(dst, WRITEMASK_XY),
              0, 0, 0,
              get_pixel_xy(c),
              src_undef(),
              src_undef());

      dst = dst_mask(dst, WRITEMASK_ZW);

      /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
       */
      emit_op(c,
              WM_LINTERP,
              dst,
              0, 0, 0,
              interp,
              deltas,
              src_undef());
      break;
   case FRAG_ATTRIB_COL0:
   case FRAG_ATTRIB_COL1:
      if (c->key.flat_shade) {
         emit_op(c,
                 WM_CINTERP,
                 dst,
                 0, 0, 0,
                 interp,
                 src_undef(),
                 src_undef());
      }
      else {
         emit_op(c,
                 WM_LINTERP,
                 dst,
                 0, 0, 0,
                 interp,
                 deltas,
                 src_undef());
      }
      break;
   default:
      emit_op(c,
              WM_PINTERP,
              dst,
              0, 0, 0,
              interp,
              deltas,
              get_pixel_w(c));
      break;
   }

   c->fp_interp_emitted |= 1 << idx;
}
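/* A standalone numeric model (not driver code) of the LINTERP/PINTERP
 * distinction above: both evaluate a plane equation in screen space, but
 * PINTERP then multiplies by the pixel's W to undo the per-vertex 1/W
 * scaling applied in the SF program.  The plane layout here is an assumption
 * for illustration only.
 */
struct plane { float c0, dcdx, dcdy; };

static float
linterp(struct plane p, float x, float y)
{
   return p.c0 + p.dcdx * x + p.dcdy * y;   /* non-perspective */
}

static float
pinterp(struct plane p, float x, float y, float pixel_w)
{
   return linterp(p, x, y) * pixel_w;       /* perspective-correct */
}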
static struct prog_src_register src_reg_from_dst(struct prog_dst_register dst)
{
   return src_reg(dst.File, dst.Index);
}
void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;
   unsigned binding;
   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete
    * primitive (all vertices).  Otherwise, avoid writing any vertices
    * for it.
    */
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *    "Prior to End of Thread with a URB_WRITE, the kernel must
          *     ensure that all writes are complete by sending the final
          *     write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute offset of this varying for the current vertex in
          * vertex_output.
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of
          * the same slot, so make sure we write the appropriate channel.
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* This is the last vertex of the primitive, so increment the
             * SO primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }
      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}
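/* A standalone scalar model (not driver code) of the overflow check emitted
 * above: the primitive is written only if its last vertex still fits, i.e.
 * (sol_prim_written + 1) * num_verts + svbi <= max_svbi.
 */
#include <stdbool.h>
#include <stdint.h>

static bool
sol_primitive_fits(uint32_t sol_prim_written, uint32_t num_verts,
                   uint32_t svbi, uint32_t max_svbi)
{
   /* Mirrors the ADD/MUL/ADD/CMP sequence on sol_temp. */
   uint32_t sol_temp = sol_prim_written + 1;
   sol_temp *= num_verts;
   sol_temp += svbi;
   return sol_temp <= max_svbi;
}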
void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;

   if (!prog_data->num_transform_feedback_bindings)
      return;

   switch (c->prog_data.output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst =
         emit(MOV(dst_reg(destination_indices_uw),
                  brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}
void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders must allocate an initial VUE handle via the
    * FF_SYNC message, but the documentation remarks that only one thread
    * can write to the URB at a time, and the FF_SYNC message provides the
    * synchronization mechanism for this: using it effectively stalls the
    * thread until it is its turn to write to the URB.  Because of this,
    * the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize
    * parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each
    * emitted vertex in vertex_output during operation.  Then, when we
    * have processed the last vertex (that is, at thread end time), we
    * send the FF_SYNC message to allocate the initial VUE handle and
    * write all buffered vertex data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
    * which come right after the data items for that vertex.  Vertex data
    * and flags for the next vertex come right after the data items and
    * flags for the previous vertex.
    */
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 c->gp->program.VerticesOut);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive.  We will set this to URB_WRITE_PRIM_START only when we
    * know that we are processing the first vertex in the primitive and to
    * zero otherwise.  This way we can use its value directly in the URB
    * write headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));

   /* The FF_SYNC message requires to know the number of primitives
    * generated, so keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), 0u));

   if (c->prog_data.gen6_xfb_enabled) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload.  If the
    * program needs it we have to move it to a separate register where we
    * can map the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we
    * need to map all input attributes to hardware registers in
    * setup_payload(), which happens before virtual registers are mapped
    * to hardware registers.  We could work around that issue if we were
    * able to compute the first non-payload register here and move the
    * PrimitiveID information to that register, but we can't because at
    * this point we don't know the final number of uniforms that will be
    * included in the payload.
    *
    * So what we do is place the PrimitiveID information in r1, which is
    * always delivered as part of the payload, but is only populated with
    * data relevant for transform feedback when we set
    * GEN6_GS_SVBI_PAYLOAD_ENABLE in the 3DSTATE_GS state packet.  That
    * information can be obtained by other means though, so we can safely
    * use r1 for this purpose.
    */
   if (c->prog_data.include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}
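/* A standalone model (not driver code) of the vertex_output layout described
 * in the prolog comment: each vertex occupies num_slots data items plus one
 * flags item, stored back to back.  get_vertex_output_offset_for_varying()
 * is assumed to reduce to this kind of indexing.
 */
static unsigned
vertex_output_slot_offset(unsigned vertex, unsigned slot, unsigned num_slots)
{
   return vertex * (num_slots + 1) + slot;       /* data item (vertex, slot) */
}

static unsigned
vertex_output_flags_offset(unsigned vertex, unsigned num_slots)
{
   return vertex * (num_slots + 1) + num_slots;  /* flags item after the data */
}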
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended
    * when first_vertex is not zero.  This is only relevant for outputs
    * other than points because in the point case we set PrimEnd on all
    * vertices.
    */
   if (c->gp->program.OutputType != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         visit((ir_end_primitive *) NULL);
      }
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (c->prog_data.gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count,
                  BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs
             * is half of one of those, since we're doing interleaved
             * writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex in
                * vertex_output.
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more
                * into this URB WRITE.
                */
               if (mrf > max_usable_mrf) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset
          * points to the first data item of the next vertex, so that we
          * can start writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (c->prog_data.gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether
    * we have emitted at least one vertex or not.  In case we did, the EOT
    * message must always include the COMPLETE flag or else the GPU hangs.
    * If we have not produced any output we can't use the COMPLETE flag.
    *
    * However, this would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so what we do is that we always request a
    * new VUE handle every time we do a URB WRITE, even for the last
    * vertex we emit.  With this we make sure that whether we have emitted
    * at least one vertex or none at all, we have to finish the thread
    * without writing to the URB, which works for both cases by setting
    * the COMPLETE and UNUSED flags in the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (c->prog_data.gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
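/* A standalone model (not driver code) of the chunking loop above: each URB
 * WRITE carries a run of slots in MRFs starting at base_mrf + 1, each MRF
 * holding half a URB row, so the per-message URB row offset is slot / 2.
 * The constants mirror base_mrf = 1 and max_usable_mrf = 13 from
 * emit_thread_end().
 */
#include <stdbool.h>

static void
urb_write_chunks(int num_slots)
{
   const int base_mrf = 1;
   const int max_usable_mrf = 13;

   int slot = 0;
   bool complete = false;
   do {
      int mrf = base_mrf + 1;
      int urb_offset = slot / 2;          /* URB rows are two slots wide */

      while (slot < num_slots && mrf <= max_usable_mrf) {
         mrf++;                           /* one slot per message register */
         slot++;
      }

      complete = slot >= num_slots;
      /* ...emit one URB WRITE covering MRFs [base_mrf, mrf) at urb_offset. */
      (void) urb_offset;
   } while (!complete);
}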
void
gen6_gs_visitor::visit(ir_emit_vertex *)
{
   this->current_annotation = "gen6 emit vertex";

   /* Honor the max_vertices layout indication in the geometry shader by
    * ignoring any vertices coming after c->gp->program.VerticesOut.
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Buffer all output slots for this vertex in vertex_output */
      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
         int varying = prog_data->vue_map.slot_to_varying[slot];
         if (varying != VARYING_SLOT_PSIZ) {
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset,
                   sizeof(src_reg));
            emit_urb_slot(dst, varying);
         } else {
            /* The PSIZ slot can pack multiple varyings in different
             * channels and emit_urb_slot() will produce a MOV instruction
             * for each of them.  Since we are writing to an array, that
             * will translate to possibly multiple MOV instructions with
             * an array destination and each will generate a scratch write
             * with the same offset into scratch space (thus, each one
             * overwriting the previous).  This is not what we want.  What
             * we will do instead is emit PSIZ to a regular temporary
             * register, then move that register into the array.  This
             * way we only have one instruction with an array destination
             * and we only produce a single scratch write.
             */
            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
            emit_urb_slot(tmp, varying);
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset,
                   sizeof(src_reg));
            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
            inst->force_writemask_all = true;
         }

         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));
      }

      /* Now buffer flags for this vertex */
      dst_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
      if (c->gp->program.OutputType == GL_POINTS) {
         /* If we are outputting points, then every vertex has PrimStart
          * and PrimEnd set.
          */
         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                       URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
      } else {
         /* Otherwise, we can only set the PrimStart flag, which we have
          * stored in the first_vertex register.  We will have to wait
          * until we execute EndPrimitive() or we end the thread to set
          * the PrimEnd flag on a vertex.
          */
         emit(OR(dst, this->first_vertex,
                 (c->prog_data.output_topology <<
                  URB_WRITE_PRIM_TYPE_SHIFT)));
         emit(MOV(dst_reg(this->first_vertex), 0u));
      }
      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));

      /* Update vertex count */
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
   }
   emit(BRW_OPCODE_ENDIF);
}
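/* A standalone model (not driver code) of the per-vertex flags DWord buffered
 * above.  The shift value and flag bits below are stand-ins for the real
 * URB_WRITE_PRIM_* definitions (an assumption here); only the packing scheme
 * matters: topology in one field, start/end as separate bits.
 */
#include <stdbool.h>
#include <stdint.h>

enum { PRIM_TYPE_SHIFT = 2, PRIM_START = 1 << 0, PRIM_END = 1 << 1 };

static uint32_t
vertex_flags_dword(uint32_t topology, bool is_first, bool is_last)
{
   uint32_t flags = topology << PRIM_TYPE_SHIFT;
   if (is_first)
      flags |= PRIM_START;   /* what first_vertex contributes via the OR */
   if (is_last)
      flags |= PRIM_END;     /* set by EndPrimitive() or at thread end */
   return flags;
}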