/**
 * Run every entry of the tests[] table in both Align1 and Align16 access
 * modes, checking the single instruction each test emits with
 * test_compact_instruction() and, if that passes, with
 * test_fuzz_compact_instruction().
 *
 * Every iteration works on its own freshly allocated brw_compile, which is
 * released with ralloc_free() on every path, including when a check fails.
 *
 * \param brw  context handed to brw_init_compile().
 * \return true if at least one check failed, false if all passed.
 */
static bool
run_tests(struct brw_context *brw)
{
   bool fail = false;

   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
      for (int align_16 = 0; align_16 <= 1; align_16++) {
         struct brw_compile *p = rzalloc(NULL, struct brw_compile);
         brw_init_compile(brw, p, p);

         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         if (align_16)
            brw_set_access_mode(p, BRW_ALIGN_16);
         else
            brw_set_access_mode(p, BRW_ALIGN_1);

         tests[i].func(p);
         assert(p->nr_insn == 1);

         /* Short-circuit on purpose: the fuzz check only runs when the plain
          * compaction check succeeded, exactly as before.
          */
         if (!test_compact_instruction(p, p->store[0]) ||
             !test_fuzz_compact_instruction(p, p->store[0]))
            fail = true;

         /* Always release the compile context; skipping this on failure
          * leaked one brw_compile per failing test.
          */
         ralloc_free(p);
      }
   }

   return fail;
}
void vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst, struct brw_reg src) { brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); brw_set_mask_control(p, BRW_MASK_DISABLE); /* If we think of the src and dst registers as composed of 8 DWORDs each, * we want to pick up the contents of DWORDs 0 and 4 from src, truncate * them to WORDs, and then pack them into DWORD 2 of dst. * * It's easier to get the EU to do this if we think of the src and dst * registers as composed of 16 WORDS each; then, we want to pick up the * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 of * dst. * * We can do that by the following EU instruction: * * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask } */ brw_MOV(p, suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4), stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0)); brw_set_access_mode(p, BRW_ALIGN_16); brw_pop_insn_state(p); }
void vec4_generator::generate_gs_set_write_offset(struct brw_reg dst, struct brw_reg src0, struct brw_reg src1) { /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message * Header: M0.3): * * Slot 0 Offset. This field, after adding to the Global Offset field * in the message descriptor, specifies the offset (in 256-bit units) * from the start of the URB entry, as referenced by URB Handle 0, at * which the data will be accessed. * * Similar text describes DWORD M0.4, which is slot 1 offset. * * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components * of the register for geometry shader invocations 0 and 1) by the * immediate value in src1, and store the result in DWORDs 3 and 4 of dst. * * We can do this with the following EU instruction: * * mul(2) dst.3<1>UD src0<8;2,4>UD src1 { Align1 WE_all } */ brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), src1); brw_set_access_mode(p, BRW_ALIGN_16); brw_pop_insn_state(p); }
void vec4_generator::generate_gs_set_dword_2_immed(struct brw_reg dst, struct brw_reg src) { assert(src.file == BRW_IMMEDIATE_VALUE); brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_MOV(p, suboffset(vec1(dst), 2), src); brw_set_access_mode(p, BRW_ALIGN_16); brw_pop_insn_state(p); }
/* Project 'pos' to screen space (or back again), overwrite with results: */ static void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) { struct brw_compile *p = &c->func; /* calc rhw */ brw_math_invert(p, get_element(pos, W), get_element(pos, W)); /* value.xyz *= value.rhw */ brw_set_access_mode(p, BRW_ALIGN_16); brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W)); brw_set_access_mode(p, BRW_ALIGN_1); }
/* This is performed against the original triangles, so no indirection * required: BZZZT! */ static void compute_tri_direction( struct brw_clip_compile *c ) { struct brw_compile *p = &c->func; struct brw_reg e = c->reg.tmp0; struct brw_reg f = c->reg.tmp1; GLuint hpos_offset = brw_vert_result_to_offset(&c->vue_map, VARYING_SLOT_POS); struct brw_reg v0 = byte_offset(c->reg.vertex[0], hpos_offset); struct brw_reg v1 = byte_offset(c->reg.vertex[1], hpos_offset); struct brw_reg v2 = byte_offset(c->reg.vertex[2], hpos_offset); struct brw_reg v0n = get_tmp(c); struct brw_reg v1n = get_tmp(c); struct brw_reg v2n = get_tmp(c); /* Convert to NDC. * NOTE: We can't modify the original vertex coordinates, * as it may impact further operations. * So, we have to keep normalized coordinates in temp registers. * * TBD-KC * Try to optimize unnecessary MOV's. */ brw_MOV(p, v0n, v0); brw_MOV(p, v1n, v1); brw_MOV(p, v2n, v2); brw_clip_project_position(c, v0n); brw_clip_project_position(c, v1n); brw_clip_project_position(c, v2n); /* Calculate the vectors of two edges of the triangle: */ brw_ADD(p, e, v0n, negate(v2n)); brw_ADD(p, f, v1n, negate(v2n)); /* Take their crossproduct: */ brw_set_access_mode(p, BRW_ALIGN_16); brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3)); brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); brw_set_access_mode(p, BRW_ALIGN_1); brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); }
/* Emit a two-source math instruction.  The instruction is generated in
 * Align1 mode and the access mode is switched back to Align16 afterwards.
 */
void
vec4_generator::generate_math2_gen6(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src0,
                                    struct brw_reg src1)
{
   /* Math cannot be Align16, hence no partial writemasks on dst. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);

   /* Source swizzles are ignored, so validate both operands. */
   check_gen6_math_src_arg(src0);
   check_gen6_math_src_arg(src1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
             dst,
             brw_math_function(inst->opcode),
             src0,
             src1);
   brw_set_access_mode(p, BRW_ALIGN_16);
}
/* Emit a single-source math instruction.  The instruction is generated in
 * Align1 mode and the access mode is switched back to Align16 afterwards.
 */
void
vec4_generator::generate_math1_gen6(vec4_instruction *inst,
                                    struct brw_reg dst,
                                    struct brw_reg src)
{
   /* Math cannot be Align16, hence no partial writemasks on dst. */
   assert(dst.dw1.bits.writemask == WRITEMASK_XYZW);
   check_gen6_math_src_arg(src);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
            dst,
            brw_math_function(inst->opcode),
            inst->base_mrf,
            src,
            BRW_MATH_DATA_SCALAR,
            BRW_MATH_PRECISION_FULL);
   brw_set_access_mode(p, BRW_ALIGN_16);
}
/* This is performed against the original triangles, so no indirection * required: BZZZT! */ static void compute_tri_direction( struct brw_clip_compile *c ) { struct brw_compile *p = &c->func; struct brw_reg e = c->reg.tmp0; struct brw_reg f = c->reg.tmp1; struct brw_reg v0 = byte_offset(c->reg.vertex[0], c->offset[VERT_RESULT_HPOS]); struct brw_reg v1 = byte_offset(c->reg.vertex[1], c->offset[VERT_RESULT_HPOS]); struct brw_reg v2 = byte_offset(c->reg.vertex[2], c->offset[VERT_RESULT_HPOS]); /* Calculate the vectors of two edges of the triangle: */ brw_ADD(p, e, v0, negate(v2)); brw_ADD(p, f, v1, negate(v2)); /* Take their crossproduct: */ brw_set_access_mode(p, BRW_ALIGN_16); brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3)); brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); brw_set_access_mode(p, BRW_ALIGN_1); brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); }
void vec4_generator::generate_gs_prepare_channel_masks(struct brw_reg dst) { /* We want to left shift just DWORD 4 (the x component belonging to the * second geometry shader invocation) by 4 bits. So generate the * instruction: * * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all } */ dst = suboffset(vec1(dst), 4); brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_SHL(p, dst, dst, brw_imm_ud(4)); brw_pop_insn_state(p); }
void vec4_generator::generate_gs_get_instance_id(struct brw_reg dst) { /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT * and store into dst.0 & dst.4. So generate the instruction: * * shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q } */ brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); dst = retype(dst, BRW_REGISTER_TYPE_UD); struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); brw_SHR(p, dst, stride(r0, 1, 4, 0), brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT)); brw_pop_insn_state(p); }
/* Split the low byte of flag register 0.0 into two nibbles: bits 3:0 go to
 * element 0 of dst, and bits 7:4, shifted down by 4, go to element 4 of dst.
 * All three instructions are emitted in Align1 mode with masking disabled.
 */
void
vec4_generator::generate_unpack_flags(vec4_instruction *inst,
                                      struct brw_reg dst)
{
   const struct brw_reg flag_reg = brw_flag_reg(0, 0);
   const struct brw_reg low_nibble_dst = suboffset(vec1(dst), 0);
   const struct brw_reg high_nibble_dst = suboffset(vec1(dst), 4);

   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* dst.0 = flags & 0x0f */
   brw_AND(p, low_nibble_dst, flag_reg, brw_imm_ud(0x0f));

   /* dst.4 = (flags & 0xf0) >> 4 */
   brw_AND(p, high_nibble_dst, flag_reg, brw_imm_ud(0xf0));
   brw_SHR(p, high_nibble_dst, high_nibble_dst, brw_imm_ud(4));

   brw_pop_insn_state(p);
}
void vec4_generator::generate_oword_dual_block_offsets(struct brw_reg m1, struct brw_reg index) { int second_vertex_offset; if (brw->gen >= 6) second_vertex_offset = 1; else second_vertex_offset = 16; m1 = retype(m1, BRW_REGISTER_TYPE_D); /* Set up M1 (message payload). Only the block offsets in M1.0 and * M1.4 are used, and the rest are ignored. */ struct brw_reg m1_0 = suboffset(vec1(m1), 0); struct brw_reg m1_4 = suboffset(vec1(m1), 4); struct brw_reg index_0 = suboffset(vec1(index), 0); struct brw_reg index_4 = suboffset(vec1(index), 4); brw_push_insn_state(p); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_set_access_mode(p, BRW_ALIGN_1); brw_MOV(p, m1_0, index_0); if (index.file == BRW_IMMEDIATE_VALUE) { index_4.dw1.ud += second_vertex_offset; brw_MOV(p, m1_4, index_4); } else { brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset)); } brw_pop_insn_state(p); }
/* Emit the SIMD4x2 sampler message for a vec4 texturing instruction:
 * choose the message type from the opcode and hardware generation, set up
 * the message header when one is present, derive the return format from the
 * destination type, and finally emit the SAMPLE and record the surface as
 * used.
 */
void
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   int msg_type = -1;

   if (brw->gen < 5) {
      /* Original (pre-Gen5) SIMD4x2 messages; each one has a fixed length. */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* No sample_d_c message exists; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         assert(!"should not get here: invalid vec4 texture opcode");
         break;
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         msg_type = inst->shadow_compare
            ? GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
            : GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+ only; otherwise brw_lower_texture_gradients() has
             * already lowered it.
             */
            assert(brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         msg_type = (brw->gen >= 7)
            ? GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS
            : GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TG4:
         msg_type = inst->shadow_compare
            ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C
            : GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         msg_type = inst->shadow_compare
            ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C
            : GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         break;
      default:
         assert(!"should not get here: invalid vec4 texture opcode");
         break;
      }
   }

   assert(msg_type != -1);

   /* Message header handling.  Before Gen6, and when no texture offset is
    * involved, an implied move from g0 to the first message register is
    * enough.  In every other case the header is built explicitly so the
    * offset bitfield (and, when needed, the sampler state pointer) can be
    * patched.
    */
   if (inst->header_present) {
      if (brw->gen < 6 && !inst->texture_offset) {
         /* Implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

         /* Start from a full copy of g0. */
         brw_push_insn_state(p);
         brw_set_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_access_mode(p, BRW_ALIGN_1);

         if (inst->texture_offset) {
            /* The texel offset bits go in DWord 2. */
            brw_MOV(p, get_element_ud(header, 2),
                    brw_imm_ud(inst->texture_offset));
         }

         if (inst->sampler >= 16) {
            /* "Sampler Index" only holds values 0..15.  Adding an offset to
             * the "Sampler State Pointer" field selects a different group of
             * 16 samplers.
             *
             * That pointer must be aligned to a 32-byte offset while each
             * sampler state is only 16 bytes, so the offset alone cannot
             * address every sampler -- both fields have to be used.
             */
            assert(brw->is_haswell); /* field only exists on Haswell */
            brw_ADD(p,
                    get_element_ud(header, 3),
                    get_element_ud(brw_vec8_grf(0, 0), 3),
                    brw_imm_ud(16 * (inst->sampler / 16) *
                               sizeof(gen7_sampler_state)));
         }
         brw_pop_insn_state(p);
      }
   }

   /* Return format follows the destination register type. */
   uint32_t return_format;
   if (dst.type == BRW_REGISTER_TYPE_D)
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
   else if (dst.type == BRW_REGISTER_TYPE_UD)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
   else
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;

   /* Gather opcodes index from a separate binding table section. */
   const bool is_gather = inst->opcode == SHADER_OPCODE_TG4 ||
                          inst->opcode == SHADER_OPCODE_TG4_OFFSET;
   uint32_t surface_index =
      (is_gather ? prog_data->base.binding_table.gather_texture_start
                 : prog_data->base.binding_table.texture_start) +
      inst->sampler;

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surface_index,
              inst->sampler % 16,
              msg_type,
              1, /* response length */
              inst->mlen,
              inst->header_present,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              return_format);

   brw_mark_surface_used(&prog_data->base, surface_index);
}
void vec4_generator::generate_gs_set_channel_masks(struct brw_reg dst, struct brw_reg src) { /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message * Header: M0.5): * * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask * * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1 * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls * Vertex 0 DATA[7]. This bit is ANDed with the corresponding * channel enable to determine the final channel enable. For the * URB_READ_OWORD & URB_READ_HWORD messages, when final channel * enable is 1 it indicates that Vertex 1 DATA [3] will be included * in the writeback message. For the URB_WRITE_OWORD & * URB_WRITE_HWORD messages, when final channel enable is 1 it * indicates that Vertex 1 DATA [3] will be written to the surface. * * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included * * 14 Vertex 1 DATA [2] Channel Mask * 13 Vertex 1 DATA [1] Channel Mask * 12 Vertex 1 DATA [0] Channel Mask * 11 Vertex 0 DATA [3] Channel Mask * 10 Vertex 0 DATA [2] Channel Mask * 9 Vertex 0 DATA [1] Channel Mask * 8 Vertex 0 DATA [0] Channel Mask * * (This is from a section of the PRM that is agnostic to the particular * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to * geometry shader invocations 0 and 1, respectively). Since we have the * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0, * and the enable flags for geometry shader invocation 1 in bits 7:0 of * DWORD 4, we just need to OR them together and store the result in bits * 15:8 of DWORD 5. * * It's easier to get the EU to do this if we think of the src and dst * registers as composed of 32 bytes each; then, we want to pick up the * contents of bytes 0 and 16 from src, OR them together, and store them in * byte 21. 
* * We can do that by the following EU instruction: * * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all } * * Note: this relies on the source register having zeros in (a) bits 7:4 of * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to * contain valid channel mask values (which are in the range 0x0-0xf). */ dst = retype(dst, BRW_REGISTER_TYPE_UB); src = retype(src, BRW_REGISTER_TYPE_UB); brw_push_insn_state(p); brw_set_access_mode(p, BRW_ALIGN_1); brw_set_mask_control(p, BRW_MASK_DISABLE); brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16)); brw_pop_insn_state(p); }
/* Emit the SIMD4x2 sampler message for a vec4 texturing instruction:
 * choose the message type from the opcode and hardware generation, set up
 * the message header if required, derive the return format from the
 * destination type, and emit the SAMPLE.
 */
void
vec4_generator::generate_tex(vec4_instruction *inst,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   int msg_type = -1;

   if (intel->gen < 5) {
      /* Original (pre-Gen5) SIMD4x2 messages; each one has a fixed length. */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
            assert(inst->mlen == 3);
         } else {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
            assert(inst->mlen == 2);
         }
         break;
      case SHADER_OPCODE_TXD:
         /* No sample_d_c message exists; comparisons are done manually. */
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
         assert(inst->mlen == 4);
         break;
      case SHADER_OPCODE_TXF:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
         assert(inst->mlen == 2);
         break;
      case SHADER_OPCODE_TXS:
         msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
         assert(inst->mlen == 2);
         break;
      default:
         assert(!"should not get here: invalid VS texture opcode");
         break;
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
         msg_type = inst->shadow_compare
            ? GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE
            : GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         break;
      case SHADER_OPCODE_TXD:
         /* No sample_d_c message exists; comparisons are done manually. */
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      default:
         assert(!"should not get here: invalid VS texture opcode");
         break;
      }
   }

   assert(msg_type != -1);

   /* Message header handling.  With a texture offset the header is built
    * explicitly so the offset bitfield can be written; otherwise, when a
    * header is present, an implied move from g0 to the first message
    * register is used.
    */
   if (inst->texture_offset) {
      const struct brw_reg header_dw2 =
         retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, inst->base_mrf, 2),
                BRW_REGISTER_TYPE_UD);

      /* Start from a full copy of g0 in the MRF. */
      brw_MOV(p,
              retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* The offset bits go in DWord 2. */
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_MOV(p, header_dw2, brw_imm_uw(inst->texture_offset));
      brw_set_access_mode(p, BRW_ALIGN_16);
   } else if (inst->header_present) {
      /* Implied move from g0 to the MRF. */
      src = brw_vec8_grf(0, 0);
   }

   /* Return format follows the destination register type. */
   uint32_t return_format;
   if (dst.type == BRW_REGISTER_TYPE_D)
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
   else if (dst.type == BRW_REGISTER_TYPE_UD)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
   else
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              SURF_INDEX_VS_TEXTURE(inst->sampler),
              inst->sampler,
              WRITEMASK_XYZW,
              msg_type,
              1, /* response length */
              inst->mlen,
              inst->header_present,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              return_format);
}
/* Emit the setup program for point sprites: for every setup register,
 * produce the attribute coefficients (m1Cx, m2Cy, m3C0) under the
 * appropriate predicate masks and write them to the URB.
 *
 * When 'allocate' is true, alloc_regs() is called first.
 */
void
brw_emit_point_sprite_setup(struct brw_sf_compile *c, GLboolean allocate)
{
   struct brw_compile *p = &c->func;
   GLuint reg;

   c->nr_verts = 1;

   if (allocate)
      alloc_regs(c);

   copy_z_inv_w(c);

   for (reg = 0; reg < c->nr_setup_regs; reg++) {
      struct brw_reg attr = offset(c->vert[0], reg);
      GLushort pc, pc_persp, pc_linear, pc_coord_replace;
      GLushort pc_const;
      GLboolean last = calculate_masks(c, reg, &pc, &pc_persp, &pc_linear);

      pc_coord_replace = calculate_point_sprite_mask(c, reg);

      /* Replaced coordinates take no part in the perspective multiply. */
      pc_persp &= ~pc_coord_replace;

      /* Channels that are written but not replaced by sprite coordinates. */
      pc_const = pc & ~pc_coord_replace;

      if (pc_persp) {
         brw_set_predicate_control_flag_value(p, pc_persp);
         brw_MUL(p, attr, attr, c->inv_w[0]);
      }

      /* Point sprite coordinate replacement: a texcoord with this enabled
       * is replaced by (x, y, 0, 1), where x and y run from 0 to 1 across
       * the horizontal and vertical extent of the point.
       */
      if (pc_coord_replace) {
         brw_set_predicate_control_flag_value(p, pc_coord_replace);

         /* Calculate 1.0/PointWidth */
         brw_math(p,
                  c->tmp,
                  BRW_MATH_FUNCTION_INV,
                  BRW_MATH_SATURATE_NONE,
                  0,
                  c->dx0,
                  BRW_MATH_DATA_SCALAR,
                  BRW_MATH_PRECISION_FULL);

         brw_set_access_mode(p, BRW_ALIGN_16);

         /* dA/dx, dA/dy */
         brw_MOV(p, c->m1Cx, brw_imm_f(0.0));
         brw_MOV(p, c->m2Cy, brw_imm_f(0.0));
         brw_MOV(p, brw_writemask(c->m1Cx, WRITEMASK_X), c->tmp);
         brw_MOV(p, brw_writemask(c->m2Cy, WRITEMASK_Y),
                 c->key.sprite_origin_lower_left ? negate(c->tmp) : c->tmp);

         /* attribute constant offset */
         brw_MOV(p, c->m3C0, brw_imm_f(0.0));
         brw_MOV(p,
                 brw_writemask(c->m3C0,
                               c->key.sprite_origin_lower_left ? WRITEMASK_YW
                                                               : WRITEMASK_W),
                 brw_imm_f(1.0));

         brw_set_access_mode(p, BRW_ALIGN_1);
      }

      if (pc_const) {
         brw_set_predicate_control_flag_value(p, pc_const);
         brw_MOV(p, c->m1Cx, brw_imm_ud(0));
         brw_MOV(p, c->m2Cy, brw_imm_ud(0));
         brw_MOV(p, c->m3C0, attr); /* constant value */
      }

      brw_set_predicate_control_flag_value(p, pc);

      /* Copy m0..m3 to URB. */
      brw_urb_WRITE(p,
                    brw_null_reg(),
                    0,
                    brw_vec8_grf(0, 0),
                    0,       /* allocate */
                    1,       /* used */
                    4,       /* msg len */
                    0,       /* response len */
                    last,    /* eot */
                    last,    /* writes complete */
                    reg * 4, /* urb destination offset */
                    BRW_URB_SWIZZLE_TRANSPOSE);
   }
}
/**
 * Emit the Gen6 geometry shader program that implements stream output
 * (transform feedback).
 *
 * \param c                 GS compile state; code is emitted into c->func.
 * \param key               program key supplying the transform feedback
 *                          bindings/swizzles, pv_first and rasterizer_discard.
 * \param num_verts         vertices per input primitive (1, 2 or 3 are
 *                          handled when emitting the output primitive).
 * \param check_edge_flags  for triangles, emit vertices conditionally based
 *                          on the edge indicator bits read from R0.2.
 */
void
gen6_sol_program(struct brw_gs_compile *c, struct brw_gs_prog_key *key,
                 unsigned num_verts, bool check_edge_flags)
{
   struct brw_compile *p = &c->func;

   c->prog_data.svbi_postincrement_value = num_verts;

   brw_gs_alloc_regs(c, num_verts, true);
   brw_gs_initialize_header(c);

   if (key->num_transform_feedback_bindings > 0) {
      unsigned vert, tfb;
      struct brw_reg dest_indices_uw =
         vec8(retype(c->reg.destination_indices, BRW_REGISTER_TYPE_UW));
      struct brw_reg temp0 = get_element_ud(c->reg.temp, 0);
      struct brw_reg svbi0 = get_element_ud(c->reg.SVBI, 0);

      /* Buffer offsets and strides are tracked through the binding table,
       * so the GS does not need a separate pointer per buffer: one pointer,
       * advancing by 1 per vertex, is enough.  SVBI0 serves as that pointer
       * for both interleaved and separate attribs modes.
       *
       * First check that the buffers can hold all the vertices.
       */
      brw_ADD(p, temp0, svbi0, brw_imm_ud(num_verts));
      brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE,
              temp0, get_element_ud(c->reg.SVBI, 4));
      brw_IF(p, BRW_EXECUTE_1);

      /* Work out the destination indices.  Normally they are SVBI[0] +
       * (0, 1, 2).  Odd-numbered triangles of a tristrip arrive with
       * reversed winding though, so the order is flipped when writing to the
       * transform feedback buffer.  To keep flatshading accurate the order
       * is SVBI[0] + (0, 2, 1) under the first provoking vertex convention
       * and SVBI[0] + (1, 0, 2) under the last provoking vertex convention.
       *
       * Note: brw_imm_v is only usable in packed-word execution mode while
       * SVBI is a double-word.  Hence the immediate ((0, 1, 2), (0, 2, 1)
       * or (1, 0, 2)) is first moved into destination_indices and SVBI is
       * added by a separate instruction.  Because the immediate is packed
       * words but double-words must be loaded, zeros are interspersed to
       * fill the upper half of each double-word.
       */
      brw_MOV(p, dest_indices_uw, brw_imm_v(0x00020100)); /* (0, 1, 2) */
      if (num_verts == 3) {
         /* Primitive type -> temp register. */
         brw_AND(p, temp0, get_element_ud(c->reg.R0, 2), brw_imm_ud(0x1f));

         /* Is it TRISTRIP_REVERSE?  The comparison is 8-wide so that the
          * conditional MOV below moves all 8 words correctly.
          */
         brw_CMP(p, vec8(brw_null_reg()), BRW_CONDITIONAL_EQ,
                 temp0, brw_imm_ud(_3DPRIM_TRISTRIP_REVERSE));

         /* If it is, replace the indices with the reordered set. */
         brw_MOV(p, dest_indices_uw,
                 brw_imm_v(key->pv_first ? 0x00010200    /* (0, 2, 1) */
                                         : 0x00020001)); /* (1, 0, 2) */
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }
      brw_ADD(p, c->reg.destination_indices,
              c->reg.destination_indices, svbi0);

      /* Per vertex, emit code that writes every varying through the
       * matching binding table entry.
       */
      for (vert = 0; vert < num_verts; ++vert) {
         /* Destination index for this vertex. */
         brw_MOV(p, get_element_ud(c->reg.header, 5),
                 get_element_ud(c->reg.destination_indices, vert));

         for (tfb = 0; tfb < key->num_transform_feedback_bindings; ++tfb) {
            unsigned char varying = key->transform_feedback_bindings[tfb];
            unsigned char slot = c->vue_map.varying_to_slot[varying];

            /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
             *
             *   "Prior to End of Thread with a URB_WRITE, the kernel must
             *   ensure that all writes are complete by sending the final
             *   write as a committed write."
             */
            bool is_last_write =
               tfb == key->num_transform_feedback_bindings - 1 &&
               vert == num_verts - 1;

            struct brw_reg varying_reg = c->reg.vertex[vert];
            varying_reg.nr += slot / 2;
            varying_reg.subnr = (slot % 2) * 16;

            /* gl_PointSize is stored in VARYING_SLOT_PSIZ.w. */
            varying_reg.dw1.bits.swizzle = varying == VARYING_SLOT_PSIZ
               ? BRW_SWIZZLE_WWWW : key->transform_feedback_swizzles[tfb];

            brw_set_access_mode(p, BRW_ALIGN_16);
            brw_MOV(p, stride(c->reg.header, 4, 4, 1),
                    retype(varying_reg, BRW_REGISTER_TYPE_UD));
            brw_set_access_mode(p, BRW_ALIGN_1);

            brw_svb_write(p,
                          is_last_write ? c->reg.temp
                                        : brw_null_reg(), /* dest */
                          1,                              /* msg_reg_nr */
                          c->reg.header,                  /* src0 */
                          SURF_INDEX_SOL_BINDING(tfb), /* binding_table_index */
                          is_last_write);              /* send_commit_msg */
         }
      }
      brw_ENDIF(p);

      /* Rebuild the header from R0: parts of it were overwritten while
       * streaming out the transform feedback data.
       */
      brw_gs_initialize_header(c);

      /* Wait for the write commit before moving on to anything else.
       *
       * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
       *
       *   The write commit does not modify the destination register, but
       *   merely clears the dependency associated with the destination
       *   register. Thus, a simple "mov" instruction using the register as a
       *   source is sufficient to wait for the write commit to occur.
       */
      brw_MOV(p, c->reg.temp, c->reg.temp);
   }

   brw_gs_ff_sync(c, 1);

   /* With RASTERIZER_DISCARD enabled nothing else remains to be done:
    * release the URB that was just allocated and terminate the thread.
    */
   if (key->rasterizer_discard) {
      brw_gs_terminate(c);
      return;
   }

   brw_gs_overwrite_header_dw2_from_r0(c);

   switch (num_verts) {
   case 1:
      brw_gs_offset_header_dw2(c,
                               URB_WRITE_PRIM_START | URB_WRITE_PRIM_END);
      brw_gs_emit_vue(c, c->reg.vertex[0], true);
      break;

   case 2:
      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[0], false);
      brw_gs_offset_header_dw2(c,
                               URB_WRITE_PRIM_END - URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[1], true);
      break;

   case 3:
      if (check_edge_flags) {
         /* Vertices 0 and 1 are only emitted for the first triangle of the
          * polygon; for any other triangle they are redundant.
          */
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_0));
         brw_IF(p, BRW_EXECUTE_1);
      }

      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[0], false);
      brw_gs_offset_header_dw2(c, -URB_WRITE_PRIM_START);
      brw_gs_emit_vue(c, c->reg.vertex[1], false);

      if (check_edge_flags) {
         brw_ENDIF(p);

         /* Vertex 2 is emitted in PRIM_END mode only for the last triangle
          * of the polygon.  Otherwise the primitive is left incomplete
          * because more polygon vertices are still to come.
          */
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
         brw_AND(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
                 get_element_ud(c->reg.R0, 2),
                 brw_imm_ud(BRW_GS_EDGE_INDICATOR_1));
         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
      }

      brw_gs_offset_header_dw2(c, URB_WRITE_PRIM_END);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      brw_gs_emit_vue(c, c->reg.vertex[2], true);
      break;
   }
}
/**
 * Translate the fragment program instructions in c->prog_instructions into
 * native code, one instruction at a time.
 *
 * Control flow is tracked with two small fixed-size stacks: if_inst[] for
 * open IF/ELSE instructions and loop_inst[] for open loops.  Every push and
 * pop on either stack is guarded by an assertion so that malformed or too
 * deeply nested programs are caught instead of indexing outside the arrays.
 *
 * \param brw  driver context (not referenced directly in this function).
 * \param c    WM compile state; code is emitted into c->func.
 */
static void
brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
{
#define MAX_IFSN 32
#define MAX_LOOP_DEPTH 32
   struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
   struct brw_instruction *inst0, *inst1;
   int i, if_insn = 0, loop_insn = 0;
   struct brw_compile *p = &c->func;
   struct brw_indirect stack_index = brw_indirect(0, 0);

   c->reg_index = 0;
   prealloc_reg(c);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   /* Point the address register at the start of the call stack. */
   brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (i = 0; i < c->nr_fp_insns; i++) {
      struct prog_instruction *inst = &c->prog_instructions[i];
      struct prog_instruction *orig_inst;

      /* When this instruction links to another one through Data, record the
       * native instruction that is current at this point in that other
       * instruction's Data field.
       */
      if ((orig_inst = inst->Data) != 0)
         orig_inst->Data = current_insn(p);

      if (inst->CondUpdate)
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
      else
         brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);

      switch (inst->Opcode) {
      case WM_PIXELXY:
         emit_pixel_xy(c, inst);
         break;
      case WM_DELTAXY:
         emit_delta_xy(c, inst);
         break;
      case WM_PIXELW:
         emit_pixel_w(c, inst);
         break;
      case WM_LINTERP:
         emit_linterp(c, inst);
         break;
      case WM_PINTERP:
         emit_pinterp(c, inst);
         break;
      case WM_CINTERP:
         emit_cinterp(c, inst);
         break;
      case WM_WPOSXY:
         emit_wpos_xy(c, inst);
         break;
      case WM_FB_WRITE:
         emit_fb_write(c, inst);
         break;
      case OPCODE_ABS:
         emit_abs(c, inst);
         break;
      case OPCODE_ADD:
         emit_add(c, inst);
         break;
      case OPCODE_SUB:
         emit_sub(c, inst);
         break;
      case OPCODE_FRC:
         emit_frc(c, inst);
         break;
      case OPCODE_FLR:
         emit_flr(c, inst);
         break;
      case OPCODE_LRP:
         emit_lrp(c, inst);
         break;
      case OPCODE_INT:
         emit_int(c, inst);
         break;
      case OPCODE_MOV:
         emit_mov(c, inst);
         break;
      case OPCODE_DP3:
         emit_dp3(c, inst);
         break;
      case OPCODE_DP4:
         emit_dp4(c, inst);
         break;
      case OPCODE_XPD:
         emit_xpd(c, inst);
         break;
      case OPCODE_DPH:
         emit_dph(c, inst);
         break;
      case OPCODE_RCP:
         emit_rcp(c, inst);
         break;
      case OPCODE_RSQ:
         emit_rsq(c, inst);
         break;
      case OPCODE_SIN:
         emit_sin(c, inst);
         break;
      case OPCODE_COS:
         emit_cos(c, inst);
         break;
      case OPCODE_EX2:
         emit_ex2(c, inst);
         break;
      case OPCODE_LG2:
         emit_lg2(c, inst);
         break;
      case OPCODE_MAX:
         emit_max(c, inst);
         break;
      case OPCODE_MIN:
         emit_min(c, inst);
         break;
      case OPCODE_DDX:
         emit_ddx(c, inst);
         break;
      case OPCODE_DDY:
         emit_ddy(c, inst);
         break;
      case OPCODE_SLT:
         emit_slt(c, inst);
         break;
      case OPCODE_SLE:
         emit_sle(c, inst);
         break;
      case OPCODE_SGT:
         emit_sgt(c, inst);
         break;
      case OPCODE_SGE:
         emit_sge(c, inst);
         break;
      case OPCODE_SEQ:
         emit_seq(c, inst);
         break;
      case OPCODE_SNE:
         emit_sne(c, inst);
         break;
      case OPCODE_MUL:
         emit_mul(c, inst);
         break;
      case OPCODE_POW:
         emit_pow(c, inst);
         break;
      case OPCODE_MAD:
         emit_mad(c, inst);
         break;
      case OPCODE_TEX:
         emit_tex(c, inst);
         break;
      case OPCODE_TXB:
         emit_txb(c, inst);
         break;
      case OPCODE_KIL_NV:
         emit_kil(c);
         break;
      case OPCODE_IF:
         assert(if_insn < MAX_IFSN);
         if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
         break;
      case OPCODE_ELSE:
         /* An ELSE without an open IF would index if_inst[-1]. */
         assert(if_insn > 0);
         if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
         break;
      case OPCODE_ENDIF:
         assert(if_insn > 0);
         brw_ENDIF(p, if_inst[--if_insn]);
         break;
      case OPCODE_BGNSUB:
      case OPCODE_ENDSUB:
         break;
      case OPCODE_CAL:
         brw_push_insn_state(p);
         brw_set_mask_control(p, BRW_MASK_DISABLE);

         /* Store IP + 3*16 at the top of the call stack ... */
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_ADD(p, deref_1ud(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));

         /* ... and advance the stack pointer by 4. */
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(4));

         /* Remember where the jump instruction emitted next lives by
          * storing its address in the linked instruction's Data field.
          */
         orig_inst = inst->Data;
         orig_inst->Data = &p->store[p->nr_insn];
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_pop_insn_state(p);
         break;
      case OPCODE_RET:
         brw_push_insn_state(p);
         brw_set_mask_control(p, BRW_MASK_DISABLE);

         /* Move the stack pointer back by 4 and load IP from the stack. */
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(-4));
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1ud(stack_index, 0));
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_pop_insn_state(p);
         break;
      case OPCODE_BGNLOOP:
         /* Guard the fixed-size loop stack the same way the IF stack is
          * guarded; nesting deeper than MAX_LOOP_DEPTH would otherwise write
          * past the end of loop_inst[].
          */
         assert(loop_insn < MAX_LOOP_DEPTH);
         loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
         break;
      case OPCODE_BRK:
         brw_BREAK(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
         brw_CONT(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_ENDLOOP:
         /* An ENDLOOP without an open loop would index loop_inst[-1]. */
         assert(loop_insn > 0);
         loop_insn--;
         inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);

         /* Patch the BREAK and CONTINUE instructions emitted since the
          * matching BGNLOOP so that they jump relative to the WHILE.
          *
          * NOTE(review): the scan covers every instruction back to this
          * loop's DO, so BREAK/CONTINUE instructions of an already closed
          * inner loop appear to be patched again here -- confirm whether
          * nested loops are handled as intended.
          */
         while (inst0 > loop_inst[loop_insn]) {
            inst0--;
            if (inst0->header.opcode == BRW_OPCODE_BREAK) {
               inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
               inst0->bits3.if_else.pop_count = 0;
            } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
               inst0->bits3.if_else.jump_count = inst1 - inst0;
               inst0->bits3.if_else.pop_count = 0;
            }
         }
         break;
      default:
         _mesa_printf("unsupported IR in fragment shader %d\n",
                      inst->Opcode);
      }

      /* The predicate state set here stays in effect for the code emitted
       * for the following instruction.
       */
      if (inst->CondUpdate)
         brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
      else
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }

   post_wm_emit(c);

   /* Clear the Data links of the program's instructions. */
   for (i = 0; i < c->fp->program.Base.NumInstructions; i++)
      c->fp->program.Base.Instructions[i].Data = NULL;
}