static void
gs_write_so(struct gs_compile_context *gcc,
            struct toy_dst dst,
            struct toy_src index, struct toy_src out,
            bool send_write_commit_message,
            int binding_table_index)
{
   struct toy_compiler *tc = &gcc->tc;
   struct toy_dst mrf_header;
   struct toy_src desc;

   mrf_header = tdst_d(tdst(TOY_FILE_MRF, gcc->first_free_mrf, 0));

   /* m0.5: destination index */
   gs_COPY1(tc, mrf_header, 5, index, 0);

   /* m0.0 - m0.3: RGBA */
   gs_COPY4(tc, mrf_header, 0, tsrc_type(out, mrf_header.type), 0);

   desc = tsrc_imm_mdesc_data_port(tc, false, 1,
         send_write_commit_message,
         true, send_write_commit_message,
         GEN6_MSG_DP_SVB_WRITE, 0,
         binding_table_index);

   tc_SEND(tc, dst, tsrc_from(mrf_header), desc, GEN6_SFID_DP_RC);
}
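/*
 * Illustrative only, not from the source: a hypothetical caller that
 * streams out every attribute of one vertex through gs_write_so() above.
 * The so_srcs/so_dsts arrays, so_count, and so_binding_table_base are
 * assumptions invented for this sketch; only the gs_write_so() signature
 * is real.  A write commit is requested on the last write only, so the
 * thread can wait for all SVB writes to land before it ends.
 */
static void
gs_write_so_vertex_sketch(struct gs_compile_context *gcc,
                          struct toy_src index,
                          const struct toy_src *so_srcs,
                          const struct toy_dst *so_dsts,
                          int so_count, int so_binding_table_base)
{
   int i;

   for (i = 0; i < so_count; i++) {
      /* only the last write asks for (and waits on) a write commit */
      const bool commit = (i == so_count - 1);

      gs_write_so(gcc, so_dsts[i], index, so_srcs[i], commit,
            so_binding_table_base + i);
   }
}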
static void
vs_lower_opcode_tgsi_const_gen7(struct vs_compile_context *vcc,
                                struct toy_dst dst, int dim,
                                struct toy_src idx)
{
   struct toy_compiler *tc = &vcc->tc;
   const struct toy_dst offset =
      tdst_ud(tdst(TOY_FILE_MRF, vcc->first_free_mrf, 0));
   struct toy_src desc;

   if (vs_lower_opcode_tgsi_const_pcb(vcc, dst, dim, idx))
      return;

   /*
    * In 259b65e2e7938de4aab323033cfe2b33369ddb07, pull constant load was
    * changed from OWord Dual Block Read to ld to increase performance in
    * the classic driver.  Since we use the constant cache instead of the
    * data cache, I wonder if we still want to follow the classic driver.
    */

   /* set offset */
   tc_MOV(tc, offset, idx);

   desc = tsrc_imm_mdesc_sampler(tc, 1, 1, false,
         GEN6_MSG_SAMPLER_SIMD4X2,
         GEN6_MSG_SAMPLER_LD,
         0,
         vcc->shader->bt.const_base + dim);

   tc_SEND(tc, dst, tsrc_from(offset), desc, GEN6_SFID_SAMPLER);
}
static void
vs_lower_opcode_tgsi_const_gen6(struct vs_compile_context *vcc,
                                struct toy_dst dst, int dim,
                                struct toy_src idx)
{
   const struct toy_dst header =
      tdst_ud(tdst(TOY_FILE_MRF, vcc->first_free_mrf, 0));
   const struct toy_dst block_offsets =
      tdst_ud(tdst(TOY_FILE_MRF, vcc->first_free_mrf + 1, 0));
   const struct toy_src r0 = tsrc_ud(tsrc(TOY_FILE_GRF, 0, 0));
   struct toy_compiler *tc = &vcc->tc;
   unsigned msg_type, msg_ctrl, msg_len;
   struct toy_inst *inst;
   struct toy_src desc;

   if (vs_lower_opcode_tgsi_const_pcb(vcc, dst, dim, idx))
      return;

   /* set message header */
   inst = tc_MOV(tc, header, r0);
   inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;

   /* set block offsets */
   tc_MOV(tc, block_offsets, idx);

   msg_type = GEN6_MSG_DP_OWORD_DUAL_BLOCK_READ;
   msg_ctrl = GEN6_MSG_DP_OWORD_DUAL_BLOCK_SIZE_1;
   msg_len = 2;

   desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
         msg_type, msg_ctrl, vcc->shader->bt.const_base + dim);

   tc_SEND(tc, dst, tsrc_from(header), desc, vcc->const_cache);
}
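/*
 * A minimal dispatch sketch, not in the source, assuming the driver's
 * usual tc->dev->gen >= ILO_GEN(7) check: Gen7+ pulls constants through
 * the sampler ld message while Gen6 uses an OWord Dual Block Read, and
 * both paths first try the push-constant fast path in
 * vs_lower_opcode_tgsi_const_pcb().
 */
static void
vs_lower_opcode_tgsi_const_sketch(struct vs_compile_context *vcc,
                                  struct toy_dst dst, int dim,
                                  struct toy_src idx)
{
   struct toy_compiler *tc = &vcc->tc;

   if (tc->dev->gen >= ILO_GEN(7))
      vs_lower_opcode_tgsi_const_gen7(vcc, dst, dim, idx);
   else
      vs_lower_opcode_tgsi_const_gen6(vcc, dst, dim, idx);
}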
static void
gs_ff_sync(struct gs_compile_context *gcc, struct toy_dst dst,
           struct toy_src num_prims)
{
   struct toy_compiler *tc = &gcc->tc;
   struct toy_dst mrf_header =
      tdst_d(tdst(TOY_FILE_MRF, gcc->first_free_mrf, 0));
   struct toy_src desc;
   bool allocate;

   gs_COPY8(tc, mrf_header, gcc->payload.header);

   /* set NumSOVertsToWrite and NumSOPrimsNeeded */
   if (gcc->write_so) {
      if (num_prims.file == TOY_FILE_IMM) {
         const uint32_t v =
            (num_prims.val32 * gcc->in_vue_count) << 16 | num_prims.val32;

         gs_COPY1(tc, mrf_header, 0, tsrc_imm_d(v), 0);
      }
      else {
         struct toy_dst m0_0 = tdst_d(gcc->vars.tmp);

         tc_MUL(tc, m0_0, num_prims, tsrc_imm_d(gcc->in_vue_count << 16));
         tc_OR(tc, m0_0, tsrc_from(m0_0), num_prims);

         gs_COPY1(tc, mrf_header, 0, tsrc_from(m0_0), 0);
      }
   }

   /* set NumGSPrimsGenerated */
   if (gcc->write_vue)
      gs_COPY1(tc, mrf_header, 1, num_prims, 0);

   /*
    * From the Sandy Bridge PRM, volume 2 part 1, page 173:
    *
    *     "Programming Note: If the GS stage is enabled, software must
    *      always allocate at least one GS URB Entry.  This is true even if
    *      the GS thread never needs to output vertices to the pipeline,
    *      e.g., when only performing stream output.  This is an artifact
    *      of the need to pass the GS thread an initial destination URB
    *      handle."
    */
   allocate = true;

   desc = tsrc_imm_mdesc_urb(tc, false, 1, 1,
         false, false, allocate,
         false, 0, 1);

   tc_SEND(tc, dst, tsrc_from(mrf_header), desc, GEN6_SFID_URB);
}
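/*
 * Illustrative helper, not in the source: the packing used by the
 * immediate path in gs_ff_sync() above.  FF_SYNC m0.0 carries
 * NumSOVertsToWrite in the high word and NumSOPrimsNeeded in the low
 * word; e.g. 2 primitives with in_vue_count == 3 pack to (6 << 16) | 2.
 */
static uint32_t
gs_pack_ff_sync_so_counts_sketch(uint32_t num_prims, uint32_t in_vue_count)
{
   const uint32_t num_so_verts = num_prims * in_vue_count;

   return (num_so_verts << 16) | num_prims;
}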
static void
fs_lower_opcode_tgsi_const_gen6(struct fs_compile_context *fcc,
                                struct toy_dst dst, int dim,
                                struct toy_src idx)
{
   const struct toy_dst header =
      tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 0));
   const struct toy_dst global_offset =
      tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 2 * 4));
   const struct toy_src r0 = tsrc_ud(tsrc(TOY_FILE_GRF, 0, 0));
   struct toy_compiler *tc = &fcc->tc;
   unsigned msg_type, msg_ctrl, msg_len;
   struct toy_inst *inst;
   struct toy_src desc;
   struct toy_dst tmp, real_dst[4];
   int i;

   /* set message header */
   inst = tc_MOV(tc, header, r0);
   inst->mask_ctrl = BRW_MASK_DISABLE;

   /* set global offset */
   inst = tc_MOV(tc, global_offset, idx);
   inst->mask_ctrl = BRW_MASK_DISABLE;
   inst->exec_size = BRW_EXECUTE_1;
   inst->src[0].rect = TOY_RECT_010;

   msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ;
   msg_ctrl = BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW << 8;
   msg_len = 1;

   desc = tsrc_imm_mdesc_data_port(tc, false, msg_len, 1, true, false,
         msg_type, msg_ctrl, ILO_WM_CONST_SURFACE(dim));

   tmp = tc_alloc_tmp(tc);

   tc_SEND(tc, tmp, tsrc_from(header), desc, fcc->const_cache);

   tdst_transpose(dst, real_dst);
   for (i = 0; i < 4; i++) {
      const struct toy_src src =
         tsrc_offset(tsrc_rect(tsrc_from(tmp), TOY_RECT_010), 0, i);

      /* cast to type D to make sure these are raw moves */
      tc_MOV(tc, tdst_d(real_dst[i]), tsrc_d(src));
   }
}
static void
fs_lower_opcode_tgsi_const_gen7(struct fs_compile_context *fcc,
                                struct toy_dst dst, int dim,
                                struct toy_src idx)
{
   struct toy_compiler *tc = &fcc->tc;
   const struct toy_dst offset =
      tdst_ud(tdst(TOY_FILE_MRF, fcc->first_free_mrf, 0));
   struct toy_src desc;
   struct toy_inst *inst;
   struct toy_dst tmp, real_dst[4];
   int i;

   /*
    * In 4c1fdae0a01b3f92ec03b61aac1d3df500d51fc6, pull constant load was
    * changed from OWord Block Read to ld to increase performance in the
    * classic driver.  Since we use the constant cache instead of the data
    * cache, I wonder if we still want to follow the classic driver.
    */

   /* set offset */
   inst = tc_MOV(tc, offset, tsrc_rect(idx, TOY_RECT_010));
   inst->exec_size = BRW_EXECUTE_8;
   inst->mask_ctrl = BRW_MASK_DISABLE;

   desc = tsrc_imm_mdesc_sampler(tc, 1, 1, false,
         BRW_SAMPLER_SIMD_MODE_SIMD4X2,
         GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
         0,
         ILO_WM_CONST_SURFACE(dim));

   tmp = tc_alloc_tmp(tc);
   inst = tc_SEND(tc, tmp, tsrc_from(offset), desc, BRW_SFID_SAMPLER);
   inst->exec_size = BRW_EXECUTE_8;
   inst->mask_ctrl = BRW_MASK_DISABLE;

   tdst_transpose(dst, real_dst);
   for (i = 0; i < 4; i++) {
      const struct toy_src src =
         tsrc_offset(tsrc_rect(tsrc_from(tmp), TOY_RECT_010), 0, i);

      /* cast to type D to make sure these are raw moves */
      tc_MOV(tc, tdst_d(real_dst[i]), tsrc_d(src));
   }
}
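/*
 * Sketch only, factored out for clarity; this helper does not exist in
 * the source.  It is the unpack loop shared by the two functions above:
 * the SEND returns one vec4 constant packed in a single register,
 * TOY_RECT_010 broadcasts component i to every channel, and the type-D
 * casts keep the moves raw so float bits pass through untouched.
 */
static void
fs_unpack_const_sketch(struct toy_compiler *tc, struct toy_dst dst,
                       struct toy_dst tmp)
{
   struct toy_dst real_dst[4];
   int i;

   tdst_transpose(dst, real_dst);
   for (i = 0; i < 4; i++) {
      const struct toy_src src =
         tsrc_offset(tsrc_rect(tsrc_from(tmp), TOY_RECT_010), 0, i);

      /* cast to type D to make sure these are raw moves */
      tc_MOV(tc, tdst_d(real_dst[i]), tsrc_d(src));
   }
}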
static void
cs_dummy(struct cs_compile_context *ccc)
{
   struct toy_compiler *tc = &ccc->tc;
   struct toy_dst header;
   struct toy_src r0, desc;
   struct toy_inst *inst;

   header = tdst_ud(tdst(TOY_FILE_MRF, ccc->first_free_mrf, 0));
   r0 = tsrc_ud(tsrc(TOY_FILE_GRF, 0, 0));

   /* copy r0 into the message header */
   inst = tc_MOV(tc, header, r0);
   inst->exec_size = GEN6_EXECSIZE_8;
   inst->mask_ctrl = GEN6_MASKCTRL_NOMASK;

   /* an EOT message to the thread spawner: end this thread */
   desc = tsrc_imm_mdesc(tc, true, 1, 0, true,
         GEN6_MSG_TS_RESOURCE_SELECT_NO_DEREF |
         GEN6_MSG_TS_REQUESTER_TYPE_ROOT |
         GEN6_MSG_TS_OPCODE_DEREF);

   tc_SEND(tc, tdst_null(), tsrc_from(header), desc, GEN6_SFID_SPAWNER);
}