/* Translate one NIR intrinsic into vec4 IR for the tessellation evaluation
 * shader.  Handles the TES-specific intrinsics (tess coord, tess levels,
 * primitive id, and input loads); anything else is delegated to the
 * generic vec4_visitor implementation.
 */
void vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   const struct brw_tes_prog_data *tes_prog_data =
      (const struct brw_tes_prog_data *) prog_data;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
               src_reg(brw_vec8_grf(1, 0))));
      break;
   case nir_intrinsic_load_tess_level_outer:
      /* Outer tess levels come from the pushed patch header (ATTR 1).
       * Isolines read two levels via a ZWZW swizzle; the other domains
       * store the levels in reverse order, hence WZYX.
       */
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_ZWZW)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      }
      break;
   case nir_intrinsic_load_tess_level_inner:
      /* Quads carry two inner levels in ATTR 0 (reversed); the other
       * domain handled here reads a single float from ATTR 1.
       */
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  src_reg(ATTR, 1, glsl_type::float_type)));
      }
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TES_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;
   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      src_reg header = input_read_header;
      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
      unsigned first_component = nir_intrinsic_component(instr);
      /* Each 64-bit component occupies two 32-bit channels. */
      if (is_64bit)
         first_component /= 2;

      if (indirect_offset.file != BAD_FILE) {
         /* Indirect addressing: fold the dynamic offset into a private
          * copy of the read header and fall through to the URB read path.
          */
         header = src_reg(this, glsl_type::uvec4_type);
         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
              input_read_header, indirect_offset);
      } else {
         /* Arbitrarily only push up to 24 vec4 slots worth of data,
          * which is 12 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
            /* Fast path: the slot was pushed into the thread payload, so a
             * plain MOV from the ATTR file suffices.
             */
            const glsl_type *src_glsl_type =
               is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
            src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

            const brw_reg_type dst_reg_type =
               is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
            emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));

            /* Grow the pushed-URB footprint to cover this slot (64-bit
             * values span two slots; two slots fit per register).
             */
            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
            break;
         }
      }

      if (!is_64bit) {
         dst_reg temp(this, glsl_type::ivec4_type);
         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         src_reg src = src_reg(temp);
         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Copy to target.  We might end up with some funky writemasks landing
          * in here, but we really don't want them in the above pseudo-ops.
          */
         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src));
      } else {
         /* For 64-bit we need to load twice as many 32-bit components, and for
          * dvec3/4 we need to emit 2 URB Read messages
          */
         dst_reg temp(this, glsl_type::dvec4_type);
         dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);

         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         if (instr->num_components > 2) {
            /* Second read for the upper half of a dvec3/dvec4. */
            read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
                        src_reg(header));
            read->offset = imm_offset + 1;
            read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
         }

         src_reg temp_as_src = src_reg(temp);
         temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Repack the raw 32-bit URB data into proper 64-bit channels. */
         dst_reg shuffled(this, glsl_type::dvec4_type);
         shuffle_64bit_data(shuffled, temp_as_src, false);

         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src_reg(shuffled)));
      }
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
void vec4_vs_visitor::emit_thread_end() { setup_uniform_clipplane_values(); /* Lower legacy ff and ClipVertex clipping to clip distances */ if (key->nr_userclip_plane_consts > 0) { current_annotation = "user clip distances"; output_reg[VARYING_SLOT_CLIP_DIST0][0] = dst_reg(this, glsl_type::vec4_type); output_reg[VARYING_SLOT_CLIP_DIST1][0] = dst_reg(this, glsl_type::vec4_type); output_num_components[VARYING_SLOT_CLIP_DIST0][0] = 4; output_num_components[VARYING_SLOT_CLIP_DIST1][0] = 4; emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0][0], 0); emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1][0], 4); } /* For VS, we always end the thread by emitting a single vertex. * emit_urb_write_opcode() will take care of setting the eot flag on the * SEND instruction. */ emit_vertex(); }
/* Read from the TCS's own output URB entry into 'dst'.
 *
 * 'base_offset' is the constant vec4 slot; 'indirect_offset' (if any) is
 * added by the header setup.  When 'first_component' is non-zero, the
 * writemask in the header is shifted accordingly and the data is read into
 * a temporary, then copied into 'dst' with a component-shifting swizzle.
 */
void
vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
                                       unsigned base_offset,
                                       unsigned first_component,
                                       const src_reg &indirect_offset)
{
   vec4_instruction *inst;

   /* Set up the message header to reference the proper parts of the URB */
   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
               brw_imm_ud(dst.writemask << first_component),
               indirect_offset);
   inst->force_writemask_all = true;

   vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
   read->offset = base_offset;
   read->mlen = 1;
   read->base_mrf = -1;

   if (first_component) {
      /* Read into a temporary and copy with a swizzle and writemask.
       * Note: the read instruction emitted above is retargeted here, after
       * the fact, to land in the temporary instead of 'dst'.
       */
      read->dst = retype(dst_reg(this, glsl_type::ivec4_type), dst.type);
      emit(MOV(dst, swizzle(src_reg(read->dst),
                            BRW_SWZ_COMP_INPUT(first_component))));
   }
}
/* Record an EndPrimitive() call by setting the cut bit for the most
 * recently emitted vertex in the control_data_bits accumulator.
 */
void
vec4_gs_visitor::gs_end_primitive()
{
   /* We can only do EndPrimitive() functionality when the control data
    * consists of cut bits.  Fortunately, the only time it isn't is when the
    * output type is points, in which case EndPrimitive() is a no-op.
    */
   if (gs_prog_data->control_data_format !=
       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
      return;
   }

   /* No control data header at all -> nothing to record. */
   if (c->control_data_header_size_bits == 0)
      return;

   /* Cut bits use one bit per vertex. */
   assert(c->control_data_bits_per_vertex == 1);

   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
    * vertex n, 0 otherwise.  So all we need to do here is mark bit
    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
    *
    * Note that if EndPrimitive() is called before emitting any vertices, this
    * will cause us to set bit 31 of the control_data_bits register to 1.
    * That's fine because:
    *
    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
    *   output, so the hardware will ignore cut bit 31.
    *
    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
    *   last vertex, so setting cut bit 31 has no effect (since the primitive
    *   is automatically ended when the GS terminates).
    *
    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
    *   control_data_bits register to 0 when the first vertex is emitted.
    */

   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
   src_reg one(this, glsl_type::uint_type);
   emit(MOV(dst_reg(one), brw_imm_ud(1u)));
   /* prev_count = vertex_count - 1 (adding 0xffffffff wraps around). */
   src_reg prev_count(this, glsl_type::uint_type);
   emit(ADD(dst_reg(prev_count), this->vertex_count,
            brw_imm_ud(0xffffffffu)));
   src_reg mask(this, glsl_type::uint_type);
   /* Note: we're relying on the fact that the GEN SHL instruction only pays
    * attention to the lower 5 bits of its second source argument, so on this
    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
    * ((vertex_count - 1) % 32).
    */
   emit(SHL(dst_reg(mask), one, prev_count));
   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
}
/* Gen6 software handling of EndPrimitive(): patch the PrimEnd flag into
 * the previously emitted vertex's stored URB flags and bump prim_count.
 */
void
gen6_gs_visitor::visit(ir_end_primitive *)
{
   this->current_annotation = "gen6 end primitive";
   /* Calling EndPrimitive() is optional for point output.  In this case we
    * set the PrimEnd flag when we process EmitVertex().
    */
   if (c->gp->program.OutputType == GL_POINTS)
      return;

   /* Otherwise we know that the last vertex we have processed was the last
    * vertex in the primitive and we need to set its PrimEnd flag, so do this
    * unless we haven't emitted that vertex at all (vertex_count != 0).
    *
    * Notice that we have already incremented vertex_count when we processed
    * the last emit_vertex, so we need to take that into account in the
    * comparison below (hence the num_output_vertices + 1 in the comparison
    * below).
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   /* First compare: vertex_count < max + 1. */
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices + 1), BRW_CONDITIONAL_L));
   /* Second compare (predicated on the first): vertex_count != 0.  The IF
    * below therefore only executes when 0 < vertex_count <= max.
    */
   vec4_instruction *inst = emit(CMP(dst_null_d(), this->vertex_count, 0u,
                                     BRW_CONDITIONAL_NEQ));
   inst->predicate = BRW_PREDICATE_NORMAL;
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* vertex_output_offset is already pointing at the first entry of the
       * next vertex.  So subtract 1 to modify the flags for the previous
       * vertex.
       */
      src_reg offset(this, glsl_type::uint_type);
      emit(ADD(dst_reg(offset), this->vertex_output_offset, src_reg(-1)));

      /* Address vertex_output indirectly through 'offset'. */
      src_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &offset, sizeof(src_reg));

      emit(OR(dst_reg(dst), dst, URB_WRITE_PRIM_END));
      emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));

      /* Set the first vertex flag to indicate that the next vertex will
       * start a primitive.
       */
      emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));
   }
   emit(BRW_OPCODE_ENDIF);
}
/* Read a vec4 from the input URB entry of vertex 'vertex_index' into 'dst'.
 *
 * The read always lands in a full temporary first; the final MOV applies
 * dst's writemask and (for offset 0 direct reads) the gl_PointSize swizzle.
 */
void
vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
                                      const src_reg &vertex_index,
                                      unsigned base_offset,
                                      unsigned first_component,
                                      const src_reg &indirect_offset)
{
   vec4_instruction *inst;
   dst_reg temp(this, glsl_type::ivec4_type);
   temp.type = dst.type;

   /* Set up the message header to reference the proper parts of the URB */
   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
   inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
               indirect_offset);
   inst->force_writemask_all = true;

   /* Read into a temporary, ignoring writemasking. */
   inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
   inst->offset = base_offset;
   inst->mlen = 1;
   inst->base_mrf = -1;

   /* Copy the temporary to the destination to deal with writemasking.
    *
    * Also attempt to deal with gl_PointSize being in the .w component.
    */
   if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
      /* Direct read of slot 0: replicate .w across all channels. */
      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
   } else {
      src_reg src = src_reg(temp);
      src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
      emit(MOV(dst, src));
   }
}
/* TES prolog: build the URB input-read header once so that every
 * subsequent input load (see nir_emit_intrinsic) can reuse it.
 */
void
vec4_tes_visitor::emit_prolog()
{
   input_read_header = src_reg(this, glsl_type::uvec4_type);
   emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));

   this->current_annotation = NULL;
}
/* Finish the TCS thread: close the odd-vertex-count IF from emit_prolog(),
 * release the input vertex URB handles on Gen7, and emit THREAD_END.
 */
void
vec4_tcs_visitor::emit_thread_end()
{
   vec4_instruction *inst;
   current_annotation = "thread end";

   /* Matching ENDIF for the IF opened in emit_prolog(). */
   if (nir->info->tcs.vertices_out % 2) {
      emit(BRW_OPCODE_ENDIF);
   }

   if (devinfo->gen == 7) {
      struct brw_tcs_prog_data *tcs_prog_data =
         (struct brw_tcs_prog_data *) prog_data;

      current_annotation = "release input vertices";

      /* Synchronize all threads, so we know that no one is still
       * using the input URB handles.
       */
      if (tcs_prog_data->instances > 1) {
         dst_reg header = dst_reg(this, glsl_type::uvec4_type);
         emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
         emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      }

      /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
       * We want to compare the bottom half of invocation_id with 0, but
       * use that truth value for the top half as well.  Unfortunately,
       * we don't have stride in the vec4 world, nor UV immediates in
       * align16, so we need an opcode to get invocation_id<0,4,0>.
       */
      set_condmod(BRW_CONDITIONAL_Z,
                  emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
                       invocation_id));
      emit(IF(BRW_PREDICATE_NORMAL));
      for (unsigned i = 0; i < key->input_vertices; i += 2) {
         /* If we have an odd number of input vertices, the last will be
          * unpaired.  We don't want to use an interleaved URB write in
          * that case.
          */
         const bool is_unpaired = i == key->input_vertices - 1;

         dst_reg header(this, glsl_type::uvec4_type);
         emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
              brw_imm_ud(is_unpaired));
      }
      emit(BRW_OPCODE_ENDIF);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
      emit_shader_time_end();

   inst = emit(TCS_OPCODE_THREAD_END);
   inst->base_mrf = 14;
   inst->mlen = 2;
}
/* GS prolog: clear r0.2 for scratch messages and zero-initialize the
 * vertex_count and (when used) control_data_bits virtual registers.
 */
void
vec4_gs_visitor::emit_prolog()
{
   /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
    * geometry shaders, it isn't (it contains a bunch of information we don't
    * need, like the input primitive type).  We need r0.2 to be zero in order
    * to build scratch read/write messages correctly (otherwise this value
    * will be interpreted as a global offset, causing us to do our scratch
    * reads/writes to garbage memory).  So just set it to zero at the top of
    * the shader.
    */
   this->current_annotation = "clear r0.2";
   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u));
   inst->force_writemask_all = true;

   /* Create a virtual register to hold the vertex count */
   this->vertex_count = src_reg(this, glsl_type::uint_type);

   /* Initialize the vertex_count register to 0 */
   this->current_annotation = "initialize vertex_count";
   inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u)));
   inst->force_writemask_all = true;

   if (c->control_data_header_size_bits > 0) {
      /* Create a virtual register to hold the current set of control data
       * bits.
       */
      this->control_data_bits = src_reg(this, glsl_type::uint_type);

      /* If we're outputting more than 32 control data bits, then EmitVertex()
       * will set control_data_bits to 0 after emitting the first vertex.
       * Otherwise, we need to initialize it to 0 here.
       */
      if (c->control_data_header_size_bits <= 32) {
         this->current_annotation = "initialize control data bits";
         inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
         inst->force_writemask_all = true;
      }
   }

   this->current_annotation = NULL;
}
/* Allocate the lowest free internal temporary register.
 *
 * c->fp_temp is a 32-bit allocation bitmask; ffs() finds the lowest clear
 * bit of its complement (1-based), which is marked used and mapped onto
 * TGSI_FILE_TEMPORARY starting at c->fp_first_internal_temp.
 */
static struct brw_fp_dst get_temp( struct brw_wm_compile *c )
{
   int bit = ffs( ~c->fp_temp );

   if (!bit) {
      /* All 32 slots are taken.  Previously we fell through and computed
       * 1 << (bit - 1) == 1 << -1, which is undefined behavior.  Report
       * the exhaustion and hand back the first internal temp without
       * corrupting the allocation mask; the program is already broken at
       * this point, but we must not invoke UB.
       */
      debug_printf("%s: out of temporaries\n", __FILE__);
      return dst_reg(TGSI_FILE_TEMPORARY, c->fp_first_internal_temp);
   }

   /* Mark the slot as in use and return the corresponding register. */
   c->fp_temp |= 1 << (bit - 1);
   return dst_reg(TGSI_FILE_TEMPORARY, c->fp_first_internal_temp + (bit - 1));
}
/* Allocate the lowest free internal temporary register.
 *
 * c->fp_temp is a bitmask of used slots; the lowest clear bit is claimed
 * and mapped onto PROGRAM_TEMPORARY starting at FIRST_INTERNAL_TEMP.
 * Exhaustion is fatal.
 */
static struct prog_dst_register get_temp( struct brw_wm_compile *c )
{
   const int slot = _mesa_ffs( ~c->fp_temp );

   if (!slot) {
      printf("%s: out of temporaries\n", __FILE__);
      exit(1);
   }

   /* Claim the slot and hand back the matching register. */
   c->fp_temp |= 1 << (slot - 1);
   return dst_reg(PROGRAM_TEMPORARY, FIRST_INTERNAL_TEMP + (slot - 1));
}
/* Reserve one float vec4 uniform per active user clip plane, remember it
 * in userplane[], and point the four param entries for that uniform at
 * the plane's components.
 */
void vec4_vs_visitor::setup_uniform_clipplane_values()
{
   const int num_planes = key->nr_userclip_plane_consts;

   for (int plane = 0; plane < num_planes; ++plane) {
      this->userplane[plane] = dst_reg(UNIFORM, this->uniforms);
      this->userplane[plane].type = BRW_REGISTER_TYPE_F;

      /* Wire the uniform's four components to the clip plane's values. */
      for (int comp = 0; comp < 4; ++comp) {
         stage_prog_data->param[this->uniforms * 4 + comp] =
            (gl_constant_value *) &clip_planes[plane][comp];
      }

      ++this->uniforms;
   }
}
/* Copy the current vertex's stored URB flags into DWord 2 of the URB write
 * message header at MRF 'mrf'.
 */
void
gen6_gs_visitor::emit_urb_write_header(int mrf)
{
   this->current_annotation = "gen6 urb header";
   /* Compute offset of the flags for the current vertex in vertex_output and
    * write them in dw2 of the message header.
    *
    * Notice that by the time that emit_thread_end() calls here
    * vertex_output_offset should point to the first data item of the current
    * vertex in vertex_output, thus we only need to add the number of output
    * slots per vertex to that offset to obtain the flags data offset.
    */
   src_reg flags_offset(this, glsl_type::uint_type);
   emit(ADD(dst_reg(flags_offset),
            this->vertex_output_offset,
            src_reg(prog_data->vue_map.num_slots)));

   /* Address vertex_output indirectly through flags_offset. */
   src_reg flags_data(this->vertex_output);
   flags_data.reladdr = ralloc(mem_ctx, src_reg);
   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));

   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
}
/* OR the 2-bit stream ID for the vertex about to be emitted into the
 * control_data_bits accumulator at the proper bit position.
 */
void
vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
{
   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */

   /* Note: we are calling this *before* increasing vertex_count, so
    * this->vertex_count == vertex_count - 1 in the formula above.
    */

   /* Stream mode uses 2 bits per vertex */
   assert(c->control_data_bits_per_vertex == 2);

   /* Must be a valid stream */
   assert(stream_id >= 0 && stream_id < MAX_VERTEX_STREAMS);

   /* Control data bits are initialized to 0 so we don't have to set any
    * bits when sending vertices to stream 0.
    */
   if (stream_id == 0)
      return;

   /* reg::sid = stream_id */
   src_reg sid(this, glsl_type::uint_type);
   emit(MOV(dst_reg(sid), stream_id));

   /* reg:shift_count = 2 * (vertex_count - 1) */
   src_reg shift_count(this, glsl_type::uint_type);
   emit(SHL(dst_reg(shift_count), this->vertex_count, 1u));

   /* Note: we're relying on the fact that the GEN SHL instruction only pays
    * attention to the lower 5 bits of its second source argument, so on this
    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
    * stream_id << ((2 * (vertex_count - 1)) % 32).
    */
   src_reg mask(this, glsl_type::uint_type);
   emit(SHL(dst_reg(mask), sid, shift_count));
   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits,
           mask));
}
void vec4_tcs_visitor::emit_urb_write(const src_reg &value, unsigned writemask, unsigned base_offset, const src_reg &indirect_offset) { if (writemask == 0) return; src_reg message(this, glsl_type::uvec4_type, 2); vec4_instruction *inst; inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), brw_imm_ud(writemask), indirect_offset); inst->force_writemask_all = true; inst = emit(MOV(offset(dst_reg(retype(message, value.type)), 1), value)); inst->force_writemask_all = true; inst = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), message); inst->offset = base_offset; inst->mlen = 2; inst->base_mrf = -1; }
static void fog_blend( struct brw_wm_compile *c, struct prog_src_register fog_factor ) { struct prog_dst_register outcolor = dst_reg(PROGRAM_OUTPUT, FRAG_RESULT_COLR); struct prog_src_register fogcolor = search_or_add_param5( c, STATE_FOG_COLOR, 0,0,0,0 ); /* color.xyz = LRP fog_factor.xxxx, output_color, fog_color */ emit_op(c, OPCODE_LRP, dst_mask(outcolor, WRITEMASK_XYZ), 0, 0, 0, fog_factor, src_reg_from_dst(outcolor), fogcolor); }
/* TCS prolog: fetch the invocation (instance) ID and, for odd output
 * vertex counts, open an IF that disables the unpaired upper half.
 */
void
vec4_tcs_visitor::emit_prolog()
{
   invocation_id = src_reg(this, glsl_type::uint_type);
   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));

   /* HS threads are dispatched with the dispatch mask set to 0xFF.
    * If there are an odd number of output vertices, then the final
    * HS instance dispatched will only have its bottom half doing real
    * work, and so we need to disable the upper half:
    */
   if (nir->info->tcs.vertices_out % 2) {
      emit(CMP(dst_null_d(), invocation_id,
               brw_imm_ud(nir->info->tcs.vertices_out), BRW_CONDITIONAL_L));

      /* Matching ENDIF is in emit_thread_end() */
      emit(IF(BRW_PREDICATE_NORMAL));
   }
}
void vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, unsigned base_offset, const src_reg &indirect_offset) { vec4_instruction *inst; /* Set up the message header to reference the proper parts of the URB */ dst_reg header = dst_reg(this, glsl_type::uvec4_type); inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, brw_imm_ud(dst.writemask), indirect_offset); inst->force_writemask_all = true; /* Read into a temporary, ignoring writemasking. */ vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); read->offset = base_offset; read->mlen = 1; read->base_mrf = -1; }
/* Emit the URB write SEND for MRF regs [base_mrf, last_mrf).  When the
 * vertex is 'complete' we also allocate a fresh VUE handle.
 */
void
gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
                                       int last_mrf, int urb_offset)
{
   vec4_instruction *inst = NULL;

   if (!complete) {
      /* If the vertex is not complete we don't have to do anything special */
      inst = emit(GS_OPCODE_URB_WRITE);
      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   } else {
      /* Otherwise we always request to allocate a new VUE handle.  If this is
       * the last write before the EOT message and the new handle never gets
       * used it will be dereferenced when we send the EOT message.  This is
       * necessary to avoid different setups for the EOT message (one for the
       * case when there is no output and another for the case when there is)
       * which would require to end the program with an IF/ELSE/ENDIF block,
       * something we do not want.
       */
      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
      inst->dst = dst_reg(MRF, base_mrf);
      inst->src[0] = this->temp;
   }

   inst->base_mrf = base_mrf;
   /* URB data written (does not include the message header reg) must
    * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
    * section 5.4.3.2.2: URB_INTERLEAVED.
    */
   int mlen = last_mrf - base_mrf;
   /* Force mlen odd: 1 header register plus an even number of data regs. */
   if ((mlen % 2) != 1)
      mlen++;
   inst->mlen = mlen;
   inst->offset = urb_offset;
}
/* Translate one NIR intrinsic into vec4 IR for the tessellation control
 * shader.  Handles invocation/primitive/patch-vertex queries, per-vertex
 * input reads, output reads/writes (including the gl_TessLevelInner/Outer
 * patch-header swizzling), and barriers; everything else is delegated to
 * the generic vec4_visitor implementation.
 */
void
vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_invocation_id:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
               invocation_id));
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;
   case nir_intrinsic_load_patch_vertices_in:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
               brw_imm_d(key->input_vertices)));
      break;
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      /* Use an immediate vertex index when it's constant. */
      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
      src_reg vertex_index =
         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      emit_input_urb_read(dst, vertex_index, imm_offset,
                          nir_intrinsic_component(instr), indirect_offset);
      break;
   }
   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
      break;
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;

         /* This is a read of gl_TessLevelInner[], which lives in the
          * Patch URB header.  The layout depends on the domain.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS: {
            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
            dst_reg tmp(this, glsl_type::vec4_type);
            emit_output_urb_read(tmp, 0, 0, src_reg());
            emit(MOV(writemask(dst, WRITEMASK_XY),
                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
            break;
         }
         case GL_TRIANGLES:
            /* DWord 4; use offset 1 but normal swizzle/writemask. */
            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, 0,
                                 src_reg());
            break;
         case GL_ISOLINES:
            /* All channels are undefined. */
            return;
         default:
            unreachable("Bogus tessellation domain");
         }
      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;
         unsigned swiz = BRW_SWIZZLE_WZYX;

         /* This is a read of gl_TessLevelOuter[], which lives in the
          * high 4 DWords of the Patch URB header, in reverse order.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS:
            dst.writemask = WRITEMASK_XYZW;
            break;
         case GL_TRIANGLES:
            dst.writemask = WRITEMASK_XYZ;
            break;
         case GL_ISOLINES:
            /* Isolines are not reversed; swizzle .zw -> .xy.
             *
             * Bug fix: this case previously ended with `return;`, which
             * threw away the swizzle/writemask just set and skipped the
             * URB read + MOV below entirely, so isoline shaders reading
             * gl_TessLevelOuter[] received no value.  It must fall
             * through to the shared read like the other domains.
             */
            swiz = BRW_SWIZZLE_ZWZW;
            dst.writemask = WRITEMASK_XY;
            break;
         default:
            unreachable("Bogus tessellation domain");
         }

         dst_reg tmp(this, glsl_type::vec4_type);
         emit_output_urb_read(tmp, 1, 0, src_reg());
         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
      } else {
         emit_output_urb_read(dst, imm_offset,
                              nir_intrinsic_component(instr),
                              indirect_offset);
      }
      break;
   }
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output: {
      src_reg value = get_nir_src(instr->src[0]);
      unsigned mask = instr->const_index[1];
      unsigned swiz = BRW_SWIZZLE_XYZW;

      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      /* The passthrough shader writes the whole patch header as two vec4s;
       * skip all the gl_TessLevelInner/Outer swizzling.
       */
      if (indirect_offset.file == BAD_FILE && !is_passthrough_shader) {
         if (imm_offset == 0) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelInner[], which lives in the
             * Patch URB header.  The layout depends on the domain.
             */
            switch (key->tes_primitive_mode) {
            case GL_QUADS:
               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
                * We use an XXYX swizzle to reverse put .xy in the .wz
                * channels, and use a .zw writemask.
                */
               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
               mask = writemask_for_backwards_vector(mask);
               break;
            case GL_TRIANGLES:
               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
                * writemask to X and bump the URB offset by 1.
                */
               imm_offset = 1;
               break;
            case GL_ISOLINES:
               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
               return;
            default:
               unreachable("Bogus tessellation domain");
            }
         } else if (imm_offset == 1) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelOuter[] which lives in the
             * Patch URB Header at DWords 4-7.  However, it's reversed, so
             * instead of .xyzw we have .wzyx.
             */
            if (key->tes_primitive_mode == GL_ISOLINES) {
               /* Isolines .xy should be stored in .zw, in order. */
               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
               mask <<= 2;
            } else {
               /* Other domains are reversed; store .wzyx instead of .xyzw. */
               swiz = BRW_SWIZZLE_WZYX;
               mask = writemask_for_backwards_vector(mask);
            }
         }
      }

      unsigned first_component = nir_intrinsic_component(instr);
      if (first_component) {
         assert(swiz == BRW_SWIZZLE_XYZW);
         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
         mask = mask << first_component;
      }

      emit_urb_write(swizzle(value, swiz), mask, imm_offset,
                     indirect_offset);
      break;
   }
   case nir_intrinsic_barrier: {
      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
/* Emit the interpolation instruction(s) that turn the per-attribute
 * plane-equation payload data into a per-fragment value in
 * PROGRAM_INPUT[idx], and record that attribute 'idx' was interpolated.
 *
 * Cleanup: the local `GLuint opcode` was assigned (WM_LINTERP) but never
 * read, and `arg2` only ever held src_undef(); both dead locals removed.
 */
static void emit_interp( struct brw_wm_compile *c,
                         GLuint idx )
{
   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
   struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
   struct prog_src_register deltas = get_delta_xy(c);

   /* Need to use PINTERP on attributes which have been
    * multiplied by 1/W in the SF program, and LINTERP on those
    * which have not:
    */
   switch (idx) {
   case FRAG_ATTRIB_WPOS:
      /* Have to treat wpos.xy specially:
       */
      emit_op(c,
              WM_WPOSXY,
              dst_mask(dst, WRITEMASK_XY),
              0, 0, 0,
              get_pixel_xy(c),
              src_undef(),
              src_undef());

      dst = dst_mask(dst, WRITEMASK_ZW);

      /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
       */
      emit_op(c,
              WM_LINTERP,
              dst,
              0, 0, 0,
              interp,
              deltas,
              src_undef());
      break;
   case FRAG_ATTRIB_COL0:
   case FRAG_ATTRIB_COL1:
      if (c->key.flat_shade) {
         emit_op(c,
                 WM_CINTERP,
                 dst,
                 0, 0, 0,
                 interp,
                 src_undef(),
                 src_undef());
      }
      else {
         emit_op(c,
                 WM_LINTERP,
                 dst,
                 0, 0, 0,
                 interp,
                 deltas,
                 src_undef());
      }
      break;
   default:
      /* Everything else was multiplied by 1/W in SF: perspective interp. */
      emit_op(c,
              WM_PINTERP,
              dst,
              0, 0, 0,
              interp,
              deltas,
              get_pixel_w(c));
      break;
   }

   c->fp_interp_emitted |= 1<<idx;
}
/* Emit the interpolation instruction(s) that turn the per-attribute
 * plane-equation payload data into a per-fragment value in
 * PROGRAM_INPUT[idx], and record that attribute 'idx' was interpolated.
 */
static void emit_interp( struct brw_wm_compile *c,
                         GLuint idx )
{
   struct prog_dst_register dst = dst_reg(PROGRAM_INPUT, idx);
   struct prog_src_register interp = src_reg(PROGRAM_PAYLOAD, idx);
   struct prog_src_register deltas = get_delta_xy(c);

   /* Need to use PINTERP on attributes which have been
    * multiplied by 1/W in the SF program, and LINTERP on those
    * which have not:
    */
   switch (idx) {
   case FRAG_ATTRIB_WPOS:
      /* Have to treat wpos.xy specially:
       */
      emit_op(c,
              WM_WPOSXY,
              dst_mask(dst, WRITEMASK_XY),
              0,
              get_pixel_xy(c),
              src_undef(),
              src_undef());

      dst = dst_mask(dst, WRITEMASK_ZW);

      /* PROGRAM_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw
       */
      emit_op(c,
              WM_LINTERP,
              dst,
              0,
              interp,
              deltas,
              src_undef());
      break;
   case FRAG_ATTRIB_COL0:
   case FRAG_ATTRIB_COL1:
      if (c->key.flat_shade) {
         emit_op(c,
                 WM_CINTERP,
                 dst,
                 0,
                 interp,
                 src_undef(),
                 src_undef());
      }
      else {
         if (c->key.linear_color) {
            emit_op(c,
                    WM_LINTERP,
                    dst,
                    0,
                    interp,
                    deltas,
                    src_undef());
         }
         else {
            /* perspective-corrected color interpolation */
            emit_op(c,
                    WM_PINTERP,
                    dst,
                    0,
                    interp,
                    deltas,
                    get_pixel_w(c));
         }
      }
      break;
   case FRAG_ATTRIB_FOGC:
      /* Interpolate the fog coordinate */
      emit_op(c,
              WM_PINTERP,
              dst_mask(dst, WRITEMASK_X),
              0,
              interp,
              deltas,
              get_pixel_w(c));

      /* Fill .yzw with the constant (0, 0, 1) via a swizzled MOV. */
      emit_op(c,
              OPCODE_MOV,
              dst_mask(dst, WRITEMASK_YZW),
              0,
              src_swizzle(interp, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
                          SWIZZLE_ONE),
              src_undef(),
              src_undef());
      break;
   case FRAG_ATTRIB_FACE:
      emit_op(c,
              WM_FRONTFACING,
              dst_mask(dst, WRITEMASK_X),
              0,
              src_undef(),
              src_undef(),
              src_undef());
      break;
   case FRAG_ATTRIB_PNTC:
      /* XXX review/test this case */
      emit_op(c,
              WM_PINTERP,
              dst_mask(dst, WRITEMASK_XY),
              0,
              interp,
              deltas,
              get_pixel_w(c));

      /* Fill .zw with the constant (0, 1) via a swizzled MOV. */
      emit_op(c,
              OPCODE_MOV,
              dst_mask(dst, WRITEMASK_ZW),
              0,
              src_swizzle(interp, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
                          SWIZZLE_ONE),
              src_undef(),
              src_undef());
      break;
   default:
      /* Everything else was multiplied by 1/W in SF: perspective interp. */
      emit_op(c,
              WM_PINTERP,
              dst,
              0,
              interp,
              deltas,
              get_pixel_w(c));
      break;
   }

   c->fp_interp_emitted |= 1<<idx;
}
/* Return a destination operand that refers to nothing (PROGRAM_UNDEFINED). */
static struct prog_dst_register dst_undef( void )
{
   return dst_reg(PROGRAM_UNDEFINED, 0);
}
/* Handle EmitVertex(): flush accumulated control data bits when a 32-bit
 * batch completes, write the vertex, record stream bits if applicable, and
 * increment vertex_count — all guarded so we never exceed max_vertices.
 */
void
vec4_gs_visitor::visit(ir_emit_vertex *ir)
{
   this->current_annotation = "emit vertex: safety check";

   /* To ensure that we don't output more vertices than the shader specified
    * using max_vertices, do the logic inside a conditional of the form "if
    * (vertex_count < MAX)"
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* If we're outputting 32 control data bits or less, then we can wait
       * until the shader is over to output them all.  Otherwise we need to
       * output them as we go.  Now is the time to do it, since we're about to
       * output the vertex_count'th vertex, so it's guaranteed that the
       * control data bits associated with the (vertex_count - 1)th vertex are
       * correct.
       */
      if (c->control_data_header_size_bits > 32) {
         this->current_annotation = "emit vertex: emit control data bits";
         /* Only emit control data bits if we've finished accumulating a batch
          * of 32 bits.  This is the case when:
          *
          *     (vertex_count * bits_per_vertex) % 32 == 0
          *
          * (in other words, when the last 5 bits of vertex_count *
          * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
          * integer n (which is always the case, since bits_per_vertex is
          * always 1 or 2), this is equivalent to requiring that the last 5-n
          * bits of vertex_count are 0:
          *
          *     vertex_count & (2^(5-n) - 1) == 0
          *
          * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
          * equivalent to:
          *
          *     vertex_count & (32 / bits_per_vertex - 1) == 0
          */
         vec4_instruction *inst =
            emit(AND(dst_null_d(), this->vertex_count,
                     (uint32_t) (32 / c->control_data_bits_per_vertex - 1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         emit(IF(BRW_PREDICATE_NORMAL));
         {
            emit_control_data_bits();

            /* Reset control_data_bits to 0 so we can start accumulating a new
             * batch.
             *
             * Note: in the case where vertex_count == 0, this neutralizes the
             * effect of any call to EndPrimitive() that the shader may have
             * made before outputting its first vertex.
             */
            inst = emit(MOV(dst_reg(this->control_data_bits), 0u));
            inst->force_writemask_all = true;
         }
         emit(BRW_OPCODE_ENDIF);
      }

      this->current_annotation = "emit vertex: vertex data";
      emit_vertex();

      /* In stream mode we have to set control data bits for all vertices
       * unless we have disabled control data bits completely (which we do
       * do for GL_POINTS outputs that don't use streams).
       */
      if (c->control_data_header_size_bits > 0 &&
          c->prog_data.control_data_format ==
             GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
         this->current_annotation = "emit vertex: Stream control data bits";
         set_stream_control_data_bits(ir->stream_id());
      }

      this->current_annotation = "emit vertex: increment vertex count";
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count,
               src_reg(1u)));
   }
   emit(BRW_OPCODE_ENDIF);

   this->current_annotation = NULL;
}
/**
 * Emit the interpolation instruction(s) that compute fragment-shader input
 * slot \p idx from the corresponding payload attribute and the pixel deltas.
 *
 * \param c            WM compile context; instructions are appended to it.
 * \param idx          input slot index (both the destination TGSI_FILE_INPUT
 *                     register and the BRW_FILE_PAYLOAD source index).
 * \param semantic     TGSI semantic of the input, which selects special-case
 *                     handling (position, color, fog, face, point size).
 * \param interp_mode  TGSI_INTERPOLATE_* mode used for color and for the
 *                     generic default case.
 */
static void emit_interp( struct brw_wm_compile *c, GLuint idx, GLuint semantic, GLuint interp_mode ) { struct brw_fp_dst dst = dst_reg(TGSI_FILE_INPUT, idx); struct brw_fp_src interp = src_reg(BRW_FILE_PAYLOAD, idx); struct brw_fp_src deltas = get_delta_xy(c); /* Need to use PINTERP on attributes which have been * multiplied by 1/W in the SF program, and LINTERP on those * which have not: */ switch (semantic) { case TGSI_SEMANTIC_POSITION: /* Have to treat wpos.xy specially: */ emit_op1(c, WM_WPOSXY, dst_mask(dst, BRW_WRITEMASK_XY), get_pixel_xy(c)); /* TGSI_FILE_INPUT.attr.xyzw = INTERP payload.interp[attr].x, deltas.xyw */ emit_op2(c, WM_LINTERP, dst_mask(dst, BRW_WRITEMASK_ZW), interp, deltas); break; case TGSI_SEMANTIC_COLOR: /* Colors: constant when flat shading, otherwise linear or
    * perspective-correct per the interpolation mode. */ if (c->key.flat_shade) { emit_op1(c, WM_CINTERP, dst, interp); } else if (interp_mode == TGSI_INTERPOLATE_LINEAR) { emit_op2(c, WM_LINTERP, dst, interp, deltas); } else { emit_op3(c, WM_PINTERP, dst, interp, deltas, get_pixel_w(c)); } break; case TGSI_SEMANTIC_FOG: /* Interpolate the fog coordinate */ emit_op3(c, WM_PINTERP, dst_mask(dst, BRW_WRITEMASK_X), interp, deltas, get_pixel_w(c)); /* Fill .yz with 0.0 and .w with 1.0 so the slot is a well-formed vec4. */ emit_op1(c, TGSI_OPCODE_MOV, dst_mask(dst, BRW_WRITEMASK_YZ), src_imm1f(c, 0.0)); emit_op1(c, TGSI_OPCODE_MOV, dst_mask(dst, BRW_WRITEMASK_W), src_imm1f(c, 1.0)); break; case TGSI_SEMANTIC_FACE: /* XXX review/test this case */ emit_op0(c, WM_FRONTFACING, dst_mask(dst, BRW_WRITEMASK_X)); emit_op1(c, TGSI_OPCODE_MOV, dst_mask(dst, BRW_WRITEMASK_YZ), src_imm1f(c, 0.0)); emit_op1(c, TGSI_OPCODE_MOV, dst_mask(dst, BRW_WRITEMASK_W), src_imm1f(c, 1.0)); break; case TGSI_SEMANTIC_PSIZE: /* XXX review/test this case */ emit_op3(c, WM_PINTERP, dst_mask(dst, BRW_WRITEMASK_XY), interp, deltas, get_pixel_w(c)); emit_op1(c, TGSI_OPCODE_MOV, dst_mask(dst, BRW_WRITEMASK_Z), src_imm1f(c, 0.0f)); emit_op1(c, TGSI_OPCODE_MOV, dst_mask(dst, BRW_WRITEMASK_W), src_imm1f(c, 1.0f)); break; default: /* Generic attribute: dispatch purely on the interpolation mode. */ switch (interp_mode) { case TGSI_INTERPOLATE_CONSTANT: emit_op1(c, WM_CINTERP,
 dst, interp); break; case TGSI_INTERPOLATE_LINEAR: emit_op2(c, WM_LINTERP, dst, interp, deltas); break; case TGSI_INTERPOLATE_PERSPECTIVE: emit_op3(c, WM_PINTERP, dst, interp, deltas, get_pixel_w(c)); break; } break; } }
static struct brw_fp_dst dst_undef( void ) { return dst_reg(TGSI_FILE_NULL, 0); }
/**
 * Emit the GS thread prolog: clear r0.2 (required for correct scratch
 * read/write messages), allocate and zero the vertex_count register,
 * optionally allocate/zero the control_data_bits register, and rewrite the
 * gl_PointSize input swizzle if the shader reads it.
 */
void vec4_gs_visitor::emit_prolog() { /* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In * geometry shaders, it isn't (it contains a bunch of information we don't * need, like the input primitive type). We need r0.2 to be zero in order * to build scratch read/write messages correctly (otherwise this value * will be interpreted as a global offset, causing us to do our scratch * reads/writes to garbage memory). So just set it to zero at the top of * the shader. */ this->current_annotation = "clear r0.2"; dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD)); vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2_IMMED, r0, 0u); inst->force_writemask_all = true; /* Create a virtual register to hold the vertex count */ this->vertex_count = src_reg(this, glsl_type::uint_type); /* Initialize the vertex_count register to 0 */ this->current_annotation = "initialize vertex_count"; inst = emit(MOV(dst_reg(this->vertex_count), 0u)); inst->force_writemask_all = true; if (c->control_data_header_size_bits > 0) { /* Create a virtual register to hold the current set of control data * bits. */ this->control_data_bits = src_reg(this, glsl_type::uint_type); /* If we're outputting more than 32 control data bits, then EmitVertex() * will set control_data_bits to 0 after emitting the first vertex. * Otherwise, we need to initialize it to 0 here. */ if (c->control_data_header_size_bits <= 32) { this->current_annotation = "initialize control data bits"; inst = emit(MOV(dst_reg(this->control_data_bits), 0u)); inst->force_writemask_all = true; } } /* If the geometry shader uses the gl_PointSize input, we need to fix it up * to account for the fact that the vertex shader stored it in the w * component of VARYING_SLOT_PSIZ.
 */ if (c->gp->program.Base.InputsRead & VARYING_BIT_PSIZ) { this->current_annotation = "swizzle gl_PointSize input"; /* Broadcast each input vertex's PSIZ.w into PSIZ.x so later reads see
    * point size in the conventional component. */ for (int vertex = 0; vertex < c->gp->program.VerticesIn; vertex++) { dst_reg dst(ATTR, BRW_VARYING_SLOT_COUNT * vertex + VARYING_SLOT_PSIZ); dst.type = BRW_REGISTER_TYPE_F; src_reg src(dst); dst.writemask = WRITEMASK_X; src.swizzle = BRW_SWIZZLE_WWWW; inst = emit(MOV(dst, src)); /* In dual instanced dispatch mode, dst has a width of 4, so we need * to make sure the MOV happens regardless of which channels are * enabled. */ inst->force_writemask_all = true; } } this->current_annotation = NULL; }
/**
 * Emit the code that runs for each EmitStreamVertex(stream_id) call: discard
 * non-zero-stream geometry when transform feedback is unused, flush a full
 * 32-bit batch of control data bits (guarded by vertex_count != 0), emit the
 * vertex data, and record per-vertex stream control data bits in SID mode.
 *
 * \param stream_id  transform-feedback stream the vertex is bound to.
 */
void vec4_gs_visitor::gs_emit_vertex(int stream_id) { this->current_annotation = "emit vertex: safety check"; /* Haswell and later hardware ignores the "Render Stream Select" bits * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, * and instead sends all primitives down the pipeline for rasterization. * If the SOL stage is enabled, "Render Stream Select" is honored and * primitives bound to non-zero streams are discarded after stream output. * * Since the only purpose of primitives sent to non-zero streams is to * be recorded by transform feedback, we can simply discard all geometry * bound to these streams when transform feedback is disabled. */ if (stream_id > 0 && !nir->info->has_transform_feedback_varyings) return; /* If we're outputting 32 control data bits or less, then we can wait * until the shader is over to output them all. Otherwise we need to * output them as we go. Now is the time to do it, since we're about to * output the vertex_count'th vertex, so it's guaranteed that the * control data bits associated with the (vertex_count - 1)th vertex are * correct. */ if (c->control_data_header_size_bits > 32) { this->current_annotation = "emit vertex: emit control data bits"; /* Only emit control data bits if we've finished accumulating a batch * of 32 bits. This is the case when: * * (vertex_count * bits_per_vertex) % 32 == 0 * * (in other words, when the last 5 bits of vertex_count * * bits_per_vertex are 0).
 Assuming bits_per_vertex == 2^n for some * integer n (which is always the case, since bits_per_vertex is * always 1 or 2), this is equivalent to requiring that the last 5-n * bits of vertex_count are 0: * * vertex_count & (2^(5-n) - 1) == 0 * * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is * equivalent to: * * vertex_count & (32 / bits_per_vertex - 1) == 0 */ vec4_instruction *inst = emit(AND(dst_null_ud(), this->vertex_count, brw_imm_ud(32 / c->control_data_bits_per_vertex - 1))); inst->conditional_mod = BRW_CONDITIONAL_Z; emit(IF(BRW_PREDICATE_NORMAL)); { /* If vertex_count is 0, then no control data bits have been * accumulated yet, so we skip emitting them. */ emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_NEQ)); emit(IF(BRW_PREDICATE_NORMAL)); emit_control_data_bits(); emit(BRW_OPCODE_ENDIF); /* Reset control_data_bits to 0 so we can start accumulating a new * batch. * * Note: in the case where vertex_count == 0, this neutralizes the * effect of any call to EndPrimitive() that the shader may have * made before outputting its first vertex. */ inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u))); inst->force_writemask_all = true; } emit(BRW_OPCODE_ENDIF); } this->current_annotation = "emit vertex: vertex data"; emit_vertex(); /* In stream mode we have to set control data bits for all vertices * unless we have disabled control data bits completely (which we do * do for GL_POINTS outputs that don't use streams). */ if (c->control_data_header_size_bits > 0 && gs_prog_data->control_data_format == GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { this->current_annotation = "emit vertex: Stream control data bits"; set_stream_control_data_bits(stream_id); } this->current_annotation = NULL; }
/**
 * Emit the VS prolog: apply per-attribute format workarounds requested via
 * key->gl_attrib_wa_flags for every input the shader reads.  Handles GL_FIXED
 * rescaling, 2101010 sign recovery, BGRA component swizzling, signed/unsigned
 * normalization (with separate ES 3.0 and desktop-GL formulas), and
 * integer-to-float scaling.  Workaround constants (shift/normalize factors)
 * are allocated lazily and shared across attributes.
 */
void vec4_vs_visitor::emit_prolog() { dst_reg sign_recovery_shift; dst_reg normalize_factor; dst_reg es3_normalize_factor; for (int i = 0; i < VERT_ATTRIB_MAX; i++) { if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) { uint8_t wa_flags = key->gl_attrib_wa_flags[i]; dst_reg reg(ATTR, i); /* Typed views of the same attribute register for integer ops. */ dst_reg reg_d = reg; reg_d.type = BRW_REGISTER_TYPE_D; dst_reg reg_ud = reg; reg_ud.type = BRW_REGISTER_TYPE_UD; /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes * come in as floating point conversions of the integer values. */ if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) { dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); /* The low WA bits hold the component count; expand it into a
          * writemask covering those components. */ dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1; emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f))); } /* Do sign recovery for 2101010 formats if required. */ if (wa_flags & BRW_ATTRIB_WA_SIGN) { if (sign_recovery_shift.file == BAD_FILE) { /* shift constant: <22,22,22,30> */ sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type); emit(MOV(writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u))); emit(MOV(writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u))); } /* Shift left then arithmetic-shift right to sign-extend the
          * 10/10/10/2 components in place. */ emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift))); emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift))); } /* Apply BGRA swizzle if required. */ if (wa_flags & BRW_ATTRIB_WA_BGRA) { src_reg temp = src_reg(reg); temp.swizzle = BRW_SWIZZLE4(2,1,0,3); emit(MOV(reg, temp)); } if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) { /* ES 3.0 has different rules for converting signed normalized * fixed-point numbers than desktop GL.
 */ if ((wa_flags & BRW_ATTRIB_WA_SIGN) && !use_legacy_snorm_formula) { /* According to equation 2.2 of the ES 3.0 specification, * signed normalization conversion is done by: * * f = c / (2^(b-1)-1) */ if (es3_normalize_factor.file == BAD_FILE) { /* mul constant: 1 / (2^(b-1) - 1) */ es3_normalize_factor = dst_reg(this, glsl_type::vec4_type); emit(MOV(writemask(es3_normalize_factor, WRITEMASK_XYZ), src_reg(1.0f / ((1<<9) - 1)))); emit(MOV(writemask(es3_normalize_factor, WRITEMASK_W), src_reg(1.0f / ((1<<1) - 1)))); } dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); emit(MOV(dst, src_reg(reg_d))); emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor))); /* Clamp to [-1, +inf): ES 3.0 requires the result be >= -1.0. */ emit_minmax(BRW_CONDITIONAL_GE, dst, src_reg(dst), src_reg(-1.0f)); } else { /* The following equations are from the OpenGL 3.2 specification: * * 2.1 unsigned normalization * f = c/(2^n-1) * * 2.2 signed normalization * f = (2c+1)/(2^n-1) * * Both of these share a common divisor, which is represented by * "normalize_factor" in the code below. */ if (normalize_factor.file == BAD_FILE) { /* 1 / (2^b - 1) for b=<10,10,10,2> */ normalize_factor = dst_reg(this, glsl_type::vec4_type); emit(MOV(writemask(normalize_factor, WRITEMASK_XYZ), src_reg(1.0f / ((1<<10) - 1)))); emit(MOV(writemask(normalize_factor, WRITEMASK_W), src_reg(1.0f / ((1<<2) - 1)))); } dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud))); /* For signed normalization, we want the numerator to be 2c+1. */ if (wa_flags & BRW_ATTRIB_WA_SIGN) { emit(MUL(dst, src_reg(dst), src_reg(2.0f))); emit(ADD(dst, src_reg(dst), src_reg(1.0f))); } emit(MUL(dst, src_reg(dst), src_reg(normalize_factor))); } } /* Convert plain (un-normalized) integers to float. */ if (wa_flags & BRW_ATTRIB_WA_SCALE) { dst_reg dst = reg; dst.type = brw_type_for_base_type(glsl_type::vec4_type); emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud))); } } } }
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits. The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void vec4_gs_visitor::emit_control_data_bits() { assert(c->control_data_bits_per_vertex != 0); /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized) * granularity, we need to use two tricks to ensure that the batch of 32 * control data bits is written to the appropriate DWORD in the URB. To * select which vec4 we are writing to, we use the "slot {0,1} offset" * fields of the message header. To select which DWORD in the vec4 we are * writing to, we use the channel mask fields of the message header. To * avoid penalizing geometry shaders that emit a small number of vertices * with extra bookkeeping, we only do each of these tricks when * c->prog_data.control_data_header_size_bits is large enough to make it * necessary. * * Note: this means that if we're outputting just a single DWORD of control * data bits, we'll actually replicate it four times since we won't do any * channel masking. But that's not a problem since in this case the * hardware only pays attention to the first DWORD. */ enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD; if (c->control_data_header_size_bits > 32) urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS; if (c->control_data_header_size_bits > 128) urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET; /* If vertex_count is 0, then no control data bits have been accumulated * yet, so we should do nothing.
 */ emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_NEQ)); emit(IF(BRW_PREDICATE_NORMAL)); { /* If we are using either channel masks or a per-slot offset, then we * need to figure out which DWORD we are trying to write to, using the * formula: * * dword_index = (vertex_count - 1) * bits_per_vertex / 32 * * Since bits_per_vertex is a power of two, and is known at compile * time, this can be optimized to: * * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) */ src_reg dword_index(this, glsl_type::uint_type); if (urb_write_flags) { src_reg prev_count(this, glsl_type::uint_type); /* prev_count = vertex_count - 1 (adding 0xffffffff wraps around). */ emit(ADD(dst_reg(prev_count), this->vertex_count, 0xffffffffu)); /* _mesa_fls returns log2(x) + 1 for powers of two, so the shift
       * below is (5 - log2(bits_per_vertex)) as the formula requires. */ unsigned log2_bits_per_vertex = _mesa_fls(c->control_data_bits_per_vertex); emit(SHR(dst_reg(dword_index), prev_count, (uint32_t) (6 - log2_bits_per_vertex))); } /* Start building the URB write message. The first MRF gets a copy of * R0. */ int base_mrf = 1; dst_reg mrf_reg(MRF, base_mrf); src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); vec4_instruction *inst = emit(MOV(mrf_reg, r0)); inst->force_writemask_all = true; if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { /* Set the per-slot offset to dword_index / 4, so that we'll write to * the appropriate OWORD within the control data header. */ src_reg per_slot_offset(this, glsl_type::uint_type); emit(SHR(dst_reg(per_slot_offset), dword_index, 2u)); emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, 1u); } if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { /* Set the channel masks to 1 << (dword_index % 4), so that we'll * write to the appropriate DWORD within the OWORD. We need to do * this computation with force_writemask_all, otherwise garbage data * from invocation 0 might clobber the mask for invocation 1 when * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks * together.
 */ src_reg channel(this, glsl_type::uint_type); inst = emit(AND(dst_reg(channel), dword_index, 3u)); inst->force_writemask_all = true; src_reg one(this, glsl_type::uint_type); inst = emit(MOV(dst_reg(one), 1u)); inst->force_writemask_all = true; src_reg channel_mask(this, glsl_type::uint_type); inst = emit(SHL(dst_reg(channel_mask), one, channel)); inst->force_writemask_all = true; emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), channel_mask); emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); } /* Store the control data bits in the message payload and send it. */ dst_reg mrf_reg2(MRF, base_mrf + 1); inst = emit(MOV(mrf_reg2, this->control_data_bits)); inst->force_writemask_all = true; inst = emit(GS_OPCODE_URB_WRITE); inst->urb_write_flags = urb_write_flags; /* We need to increment Global Offset by 256-bits to make room for * Broadwell's extra "Vertex Count" payload at the beginning of the * URB entry. Since this is an OWord message, Global Offset is counted * in 128-bit units, so we must set it to 2. */ if (brw->gen >= 8) inst->offset = 2; inst->base_mrf = base_mrf; inst->mlen = 2; } emit(BRW_OPCODE_ENDIF); }