void
vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   const struct brw_tes_prog_data *tes_prog_data =
      (const struct brw_tes_prog_data *) prog_data;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
               src_reg(brw_vec8_grf(1, 0))));
      break;
   case nir_intrinsic_load_tess_level_outer:
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_ZWZW)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      }
      break;
   case nir_intrinsic_load_tess_level_inner:
      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
                          BRW_SWIZZLE_WZYX)));
      } else {
         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
                  src_reg(ATTR, 1, glsl_type::float_type)));
      }
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TES_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      src_reg header = input_read_header;
      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
      unsigned first_component = nir_intrinsic_component(instr);
      if (is_64bit)
         first_component /= 2;

      if (indirect_offset.file != BAD_FILE) {
         header = src_reg(this, glsl_type::uvec4_type);
         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
              input_read_header, indirect_offset);
      } else {
         /* Arbitrarily only push up to 24 vec4 slots worth of data,
          * which is 12 registers (since each holds 2 vec4 slots).
          */
         const unsigned max_push_slots = 24;
         if (imm_offset < max_push_slots) {
            const glsl_type *src_glsl_type =
               is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
            src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

            const brw_reg_type dst_reg_type =
               is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
            emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));

            prog_data->urb_read_length =
               MAX2(prog_data->urb_read_length,
                    DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
            break;
         }
      }

      if (!is_64bit) {
         dst_reg temp(this, glsl_type::ivec4_type);
         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         src_reg src = src_reg(temp);
         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         /* Copy to target.  We might end up with some funky writemasks
          * landing in here, but we really don't want them in the above
          * pseudo-ops.
          */
         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src));
      } else {
         /* For 64-bit we need to load twice as many 32-bit components, and
          * for dvec3/4 we need to emit 2 URB Read messages.
          */
         dst_reg temp(this, glsl_type::dvec4_type);
         dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);

         vec4_instruction *read =
            emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
         read->offset = imm_offset;
         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;

         if (instr->num_components > 2) {
            read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
                        src_reg(header));
            read->offset = imm_offset + 1;
            read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
         }

         src_reg temp_as_src = src_reg(temp);
         temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);

         dst_reg shuffled(this, glsl_type::dvec4_type);
         shuffle_64bit_data(shuffled, temp_as_src, false);

         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
         dst.writemask = brw_writemask_for_size(instr->num_components);
         emit(MOV(dst, src_reg(shuffled)));
      }
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
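/* A hedged aside on the push-model bookkeeping above (illustrative helper,
 * not driver code): inputs in the first 24 vec4 slots are read from pushed
 * ATTR registers, and urb_read_length is grown to cover the highest slot
 * touched, in GRF units of two slots each; a dvec4 spans two slots.
 */
static inline unsigned
pushed_grfs_for_input(unsigned imm_offset, bool is_64bit)
{
   /* Mirrors DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2) above. */
   unsigned last_slot = imm_offset + (is_64bit ? 2 : 1);
   return (last_slot + 1) / 2;
}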
void
gen6_gs_visitor::emit_thread_end()
{
   /* Make sure the current primitive is ended: we know it is not ended when
    * first_vertex is not zero. This is only relevant for outputs other than
    * points because in the point case we set PrimEnd on all vertices.
    */
   if (c->gp->program.OutputType != GL_POINTS) {
      emit(CMP(dst_null_d(), this->first_vertex, 0u, BRW_CONDITIONAL_Z));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         visit((ir_end_primitive *) NULL);
      }
      emit(BRW_OPCODE_ENDIF);
   }

   /* Here we have to:
    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
    * 2) Loop over all buffered vertex data and write it to corresponding
    *    URB entries.
    * 3) Allocate new VUE handles for all vertices other than the first.
    * 4) Send a final EOT message.
    */

   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;

   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
   emit(CMP(dst_null_d(), this->vertex_count, 0u, BRW_CONDITIONAL_G));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      this->current_annotation = "gen6 thread end: ff_sync";

      vec4_instruction *inst;
      if (c->prog_data.gen6_xfb_enabled) {
         src_reg sol_temp(this, glsl_type::uvec4_type);
         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
              dst_reg(this->svbi),
              this->vertex_count,
              this->prim_count,
              sol_temp);
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, this->svbi);
      } else {
         inst = emit(GS_OPCODE_FF_SYNC,
                     dst_reg(this->temp), this->prim_count, src_reg(0u));
      }
      inst->base_mrf = base_mrf;

      /* Loop over all buffered vertices and emit URB write messages */
      this->current_annotation = "gen6 thread end: urb writes init";
      src_reg vertex(this, glsl_type::uint_type);
      emit(MOV(dst_reg(vertex), 0u));
      emit(MOV(dst_reg(this->vertex_output_offset), 0u));

      this->current_annotation = "gen6 thread end: urb writes";
      emit(BRW_OPCODE_DO);
      {
         emit(CMP(dst_null_d(), vertex, this->vertex_count,
                  BRW_CONDITIONAL_GE));
         inst = emit(BRW_OPCODE_BREAK);
         inst->predicate = BRW_PREDICATE_NORMAL;

         /* First we prepare the message header */
         emit_urb_write_header(base_mrf);

         /* Then add vertex data to the message in interleaved fashion */
         int slot = 0;
         bool complete = false;
         do {
            int mrf = base_mrf + 1;

            /* URB offset is in URB row increments, and each of our MRFs is
             * half of one of those, since we're doing interleaved writes.
             */
            int urb_offset = slot / 2;

            for (; slot < prog_data->vue_map.num_slots; ++slot) {
               int varying = prog_data->vue_map.slot_to_varying[slot];
               current_annotation = output_reg_annotation[varying];

               /* Compute offset of this slot for the current vertex
                * in vertex_output
                */
               src_reg data(this->vertex_output);
               data.reladdr = ralloc(mem_ctx, src_reg);
               memcpy(data.reladdr, &this->vertex_output_offset,
                      sizeof(src_reg));

               /* Copy this slot to the appropriate message register */
               dst_reg reg = dst_reg(MRF, mrf);
               reg.type = output_reg[varying].type;
               data.type = reg.type;
               vec4_instruction *inst = emit(MOV(reg, data));
               inst->force_writemask_all = true;

               mrf++;
               emit(ADD(dst_reg(this->vertex_output_offset),
                        this->vertex_output_offset, 1u));

               /* If this was max_usable_mrf, we can't fit anything more into
                * this URB WRITE.
                */
               if (mrf > max_usable_mrf) {
                  slot++;
                  break;
               }
            }

            complete = slot >= prog_data->vue_map.num_slots;
            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
         } while (!complete);

         /* Skip over the flags data item so that vertex_output_offset points
          * to the first data item of the next vertex, so that we can start
          * writing the next vertex.
          */
         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));

         emit(ADD(dst_reg(vertex), vertex, 1u));
      }
      emit(BRW_OPCODE_WHILE);

      if (c->prog_data.gen6_xfb_enabled)
         xfb_write();
   }
   emit(BRW_OPCODE_ENDIF);

   /* Finally, emit the EOT message.
    *
    * In gen6 we need to end the thread differently depending on whether we
    * have emitted at least one vertex or not. If we did, the EOT message must
    * always include the COMPLETE flag or else the GPU hangs. If we have not
    * produced any output we can't use the COMPLETE flag.
    *
    * However, that would lead us to end the program with an ENDIF opcode,
    * which we want to avoid, so instead we always request a new VUE handle
    * on every URB WRITE, even for the last vertex we emit. This way, whether
    * we emitted at least one vertex or none at all, we finish the thread
    * without writing to the URB, which works in both cases by setting the
    * COMPLETE and UNUSED flags in the EOT message.
    */
   this->current_annotation = "gen6 thread end: EOT";

   if (c->prog_data.gen6_xfb_enabled) {
      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
      src_reg data(this, glsl_type::uint_type);
      emit(AND(dst_reg(data), this->sol_prim_written, src_reg(0xffffu)));
      emit(SHL(dst_reg(data), data, src_reg(16u)));
      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
   }

   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
   inst->base_mrf = base_mrf;
   inst->mlen = 1;
}
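/* A minimal host-side sketch (not driver code) of the interleaved URB write
 * loop above: each URB row holds two vec4 slots, so the per-slot write offset
 * advances one row per two slots, while data MRFs are consumed one per slot
 * starting at base_mrf + 1. num_slots here is an assumed example value.
 */
#include <cstdio>

int main() {
   const int num_slots = 20;       /* assumed vue_map.num_slots */
   const int base_mrf = 1;
   const int max_usable_mrf = 13;  /* MRFs 14-15 reserved for spill reads */

   int slot = 0;
   bool complete = false;
   do {
      int mrf = base_mrf + 1;
      int urb_offset = slot / 2;   /* URB rows: two vec4 slots per row */
      for (; slot < num_slots; ++slot) {
         printf("slot %d -> MRF %d (URB row offset %d)\n", slot, mrf, urb_offset);
         if (++mrf > max_usable_mrf) { slot++; break; }
      }
      complete = slot >= num_slots;
      /* The real code emits one URB WRITE here covering MRFs 2..mrf-1. */
   } while (!complete);
}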
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;

	int fs = _FS;
	int fd = _FD;

	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);

		// Small optimization: 0 is our default mode anyway.
		if (setMXCSR == 0 && !js.hasSetRounding) {
			setMXCSR = -1;
		}
		if (setMXCSR != -1) {
			STMXCSR(M(&mxcsrTemp));
			MOV(32, R(TEMPREG), M(&mxcsrTemp));
			AND(32, R(TEMPREG), Imm32(~(3 << 13)));
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
			MOV(32, M(&mips_->temp), R(TEMPREG));
			LDMXCSR(M(&mips_->temp));
		}

		(this->*conv)(TEMPREG, fpr.R(fs));

		// Did we get an indefinite integer value?
		CMP(32, R(TEMPREG), Imm32(0x80000000));
		FixupBranch skip = J_CC(CC_NE);
		if (fd != fs) {
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		XORPS(XMM1, R(XMM1));
		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 and inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
		XOR(32, R(TEMPREG), Imm32(0x7fffffff));

		SetJumpTarget(skip);
		MOVD_xmm(fpr.RX(fd), R(TEMPREG));

		if (setMXCSR != -1) {
			LDMXCSR(M(&mxcsrTemp));
		}
	};

	switch (op & 0x3f) {
	case 5:	//F(fd) = fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), M(ssNoSignMask));
			ANDPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			ANDPS(fpr.RX(fd), M(ssNoSignMask));
		}
		break;

	case 6:	//F(fd) = F(fs); break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7:	//F(fd) = -F(fs); break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), M(ssSignBits2));
			XORPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			XORPS(fpr.RX(fd), M(ssSignBits2));
		}
		break;

	case 4:	//F(fd) = sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s
		execRounding(&XEmitter::CVTTSS2SI, -1);
		break;

	case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);
		if (fpr.IsMapped(fs)) {
			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
		} else {
			// If fs was fd, we'd be in the case above since we mapped fd.
			MOVSS(fpr.RX(fd), fpr.R(fs));
			CVTDQ2PS(fpr.RX(fd), fpr.R(fd));
		}
		break;

	case 36: //FsI(fd) = (int) F(fs); break; //cvt.w.s
		// Uses the current rounding mode.
		execRounding(&XEmitter::CVTSS2SI, -1);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
		execRounding(&XEmitter::CVTSS2SI, 0);
		break;

	case 14: //FsI(fd) = (int)ceilf(F(fs)); break; //ceil.w.s
		execRounding(&XEmitter::CVTSS2SI, 2);
		break;

	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
		execRounding(&XEmitter::CVTSS2SI, 1);
		break;

	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
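/* Context for execRounding above: the x86 MXCSR rounding-control field lives
 * in bits 13-14, which is why the JIT masks with ~(3 << 13) and ORs in
 * (setMXCSR << 13). A hedged host-side illustration using standard SSE
 * intrinsics (not PPSSPP code):
 */
#include <immintrin.h>
#include <cstdio>

int main() {
	unsigned int saved = _mm_getcsr();
	// RC encodings: 0 = nearest (round.w), 1 = toward -inf (floor.w),
	//               2 = toward +inf (ceil.w), 3 = toward zero.
	// trunc.w.s above instead uses CVTTSS2SI, which always truncates.
	_mm_setcsr((saved & ~(3u << 13)) | (1u << 13));    // floor mode
	printf("%d\n", _mm_cvtss_si32(_mm_set_ss(2.5f)));  // prints 2
	_mm_setcsr(saved);  // restore, like the final LDMXCSR
}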
void
gen8_vec4_generator::generate_vec4_instruction(vec4_instruction *instruction,
                                               struct brw_reg dst,
                                               struct brw_reg *src)
{
   vec4_instruction *ir = (vec4_instruction *) instruction;

   if (dst.width == BRW_WIDTH_4) {
      /* This happens in attribute fixups for "dual instanced" geometry
       * shaders, since they use attributes that are vec4's.  Since the exec
       * width is only 4, it's essential that the caller set
       * force_writemask_all in order to make sure the instruction is executed
       * regardless of which channels are enabled.
       */
      assert(ir->force_writemask_all);

      /* Fix up any <8;8,1> or <0;4,1> source registers to <4;4,1> to satisfy
       * the following register region restrictions (from Graphics BSpec:
       * 3D-Media-GPGPU Engine > EU Overview > Registers and Register Regions
       * > Register Region Restrictions):
       *
       *    1. ExecSize must be greater than or equal to Width.
       *
       *    2. If ExecSize = Width and HorzStride != 0, VertStride must be
       *       set to Width * HorzStride.
       */
      for (int i = 0; i < 3; i++) {
         if (src[i].file == BRW_GENERAL_REGISTER_FILE)
            src[i] = stride(src[i], 4, 4, 1);
      }
   }

   switch (ir->opcode) {
   case BRW_OPCODE_MOV: MOV(dst, src[0]); break;
   case BRW_OPCODE_ADD: ADD(dst, src[0], src[1]); break;
   case BRW_OPCODE_MUL: MUL(dst, src[0], src[1]); break;
   case BRW_OPCODE_MACH: MACH(dst, src[0], src[1]); break;
   case BRW_OPCODE_MAD: MAD(dst, src[0], src[1], src[2]); break;
   case BRW_OPCODE_FRC: FRC(dst, src[0]); break;
   case BRW_OPCODE_RNDD: RNDD(dst, src[0]); break;
   case BRW_OPCODE_RNDE: RNDE(dst, src[0]); break;
   case BRW_OPCODE_RNDZ: RNDZ(dst, src[0]); break;
   case BRW_OPCODE_AND: AND(dst, src[0], src[1]); break;
   case BRW_OPCODE_OR: OR(dst, src[0], src[1]); break;
   case BRW_OPCODE_XOR: XOR(dst, src[0], src[1]); break;
   case BRW_OPCODE_NOT: NOT(dst, src[0]); break;
   case BRW_OPCODE_ASR: ASR(dst, src[0], src[1]); break;
   case BRW_OPCODE_SHR: SHR(dst, src[0], src[1]); break;
   case BRW_OPCODE_SHL: SHL(dst, src[0], src[1]); break;
   case BRW_OPCODE_CMP: CMP(dst, ir->conditional_mod, src[0], src[1]); break;
   case BRW_OPCODE_SEL: SEL(dst, src[0], src[1]); break;
   case BRW_OPCODE_DPH: DPH(dst, src[0], src[1]); break;
   case BRW_OPCODE_DP4: DP4(dst, src[0], src[1]); break;
   case BRW_OPCODE_DP3: DP3(dst, src[0], src[1]); break;
   case BRW_OPCODE_DP2: DP2(dst, src[0], src[1]); break;
   case BRW_OPCODE_F32TO16: F32TO16(dst, src[0]); break;
   case BRW_OPCODE_F16TO32: F16TO32(dst, src[0]); break;
   case BRW_OPCODE_LRP: LRP(dst, src[0], src[1], src[2]); break;

   case BRW_OPCODE_BFREV:
      /* BFREV only supports UD type for src and dst. */
      BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
            retype(src[0], BRW_REGISTER_TYPE_UD));
      break;
   case BRW_OPCODE_FBH:
      /* FBH only supports UD type for dst. */
      FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_FBL:
      /* FBL only supports UD type for dst. */
      FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;
   case BRW_OPCODE_CBIT:
      /* CBIT only supports UD type for dst. */
      CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
      break;

   case BRW_OPCODE_ADDC: ADDC(dst, src[0], src[1]); break;
   case BRW_OPCODE_SUBB: SUBB(dst, src[0], src[1]); break;

   case BRW_OPCODE_BFE: BFE(dst, src[0], src[1], src[2]); break;
   case BRW_OPCODE_BFI1: BFI1(dst, src[0], src[1]); break;
   case BRW_OPCODE_BFI2: BFI2(dst, src[0], src[1], src[2]); break;

   case BRW_OPCODE_IF: IF(ir->predicate); break;
   case BRW_OPCODE_ELSE: ELSE(); break;
   case BRW_OPCODE_ENDIF: ENDIF(); break;
   case BRW_OPCODE_DO: DO(); break;
   case BRW_OPCODE_BREAK: BREAK(); break;
   case BRW_OPCODE_CONTINUE: CONTINUE(); break;
   case BRW_OPCODE_WHILE: WHILE(); break;

   case SHADER_OPCODE_RCP: MATH(BRW_MATH_FUNCTION_INV, dst, src[0]); break;
   case SHADER_OPCODE_RSQ: MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]); break;
   case SHADER_OPCODE_SQRT: MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]); break;
   case SHADER_OPCODE_EXP2: MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]); break;
   case SHADER_OPCODE_LOG2: MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]); break;
   case SHADER_OPCODE_SIN: MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]); break;
   case SHADER_OPCODE_COS: MATH(BRW_MATH_FUNCTION_COS, dst, src[0]); break;
   case SHADER_OPCODE_POW:
      MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
      MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
      break;
   case SHADER_OPCODE_INT_REMAINDER:
      MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      generate_tex(ir, dst);
      break;

   case VS_OPCODE_URB_WRITE:
      generate_urb_write(ir, true);
      break;
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      generate_scratch_read(ir, dst, src[0]);
      break;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      generate_scratch_write(ir, dst, src[0], src[1]);
      break;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
      generate_pull_constant_load(ir, dst, src[0], src[1]);
      break;

   case GS_OPCODE_URB_WRITE:
      generate_urb_write(ir, false);
      break;
   case GS_OPCODE_THREAD_END:
      generate_gs_thread_end(ir);
      break;
   case GS_OPCODE_SET_WRITE_OFFSET:
      generate_gs_set_write_offset(dst, src[0], src[1]);
      break;
   case GS_OPCODE_SET_VERTEX_COUNT:
      generate_gs_set_vertex_count(dst, src[0]);
      break;
   case GS_OPCODE_SET_DWORD_2_IMMED:
      generate_gs_set_dword_2_immed(dst, src[0]);
      break;
   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
      generate_gs_prepare_channel_masks(dst);
      break;
   case GS_OPCODE_SET_CHANNEL_MASKS:
      generate_gs_set_channel_masks(dst, src[0]);
      break;

   case SHADER_OPCODE_SHADER_TIME_ADD:
      assert(!"XXX: Missing Gen8 vec4 support for INTEL_DEBUG=shader_time");
      break;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
      assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_ATOMIC");
      break;
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      assert(!"XXX: Missing Gen8 vec4 support for UNTYPED_SURFACE_READ");
      break;
   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
      assert(!"VS_OPCODE_UNPACK_FLAGS_SIMD4X2 should not be used on Gen8+.");
      break;

   default:
      if (ir->opcode < (int) ARRAY_SIZE(opcode_descs)) {
         _mesa_problem(ctx, "Unsupported opcode in `%s' in VS\n",
                       opcode_descs[ir->opcode].name);
      } else {
         _mesa_problem(ctx, "Unsupported opcode %d in VS", ir->opcode);
      }
      abort();
   }
}
bool
fs_visitor::opt_cse_local(bblock_t *block, exec_list *aeb)
{
   bool progress = false;
   void *mem_ctx = ralloc_context(this->mem_ctx);

   int ip = block->start_ip;
   for (fs_inst *inst = (fs_inst *) block->start;
        inst != block->end->next;
        inst = (fs_inst *) inst->next) {

      /* Skip some cases. */
      if (is_expression(inst) && !inst->predicate && !inst->is_partial_write() &&
          !inst->conditional_mod && inst->dst.file != HW_REG) {
         bool found = false;

         aeb_entry *entry;
         foreach_list(entry_node, aeb) {
            entry = (aeb_entry *) entry_node;

            /* Match current instruction's expression against those in AEB. */
            if (inst->opcode == entry->generator->opcode &&
                inst->saturate == entry->generator->saturate &&
                inst->dst.type == entry->generator->dst.type &&
                operands_match(inst->opcode, entry->generator->src, inst->src)) {
               found = true;
               progress = true;
               break;
            }
         }

         if (!found) {
            /* Our first sighting of this expression.  Create an entry. */
            aeb_entry *entry = ralloc(mem_ctx, aeb_entry);
            entry->tmp = reg_undef;
            entry->generator = inst;
            aeb->push_tail(entry);
         } else {
            /* This is at least our second sighting of this expression.
             * If we don't have a temporary already, make one.
             */
            bool no_existing_temp = entry->tmp.file == BAD_FILE;
            if (no_existing_temp) {
               int written = entry->generator->regs_written;

               fs_reg orig_dst = entry->generator->dst;
               fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
                                   orig_dst.type);
               entry->tmp = tmp;
               entry->generator->dst = tmp;

               for (int i = 0; i < written; i++) {
                  fs_inst *copy = MOV(orig_dst, tmp);
                  copy->force_writemask_all =
                     entry->generator->force_writemask_all;
                  entry->generator->insert_after(copy);

                  orig_dst.reg_offset++;
                  tmp.reg_offset++;
               }
            }

            /* dest <- temp */
            int written = inst->regs_written;
            assert(written == entry->generator->regs_written);
            assert(inst->dst.type == entry->tmp.type);
            fs_reg dst = inst->dst;
            fs_reg tmp = entry->tmp;
            fs_inst *copy = NULL;
            for (int i = 0; i < written; i++) {
               copy = MOV(dst, tmp);
               copy->force_writemask_all = inst->force_writemask_all;
               inst->insert_before(copy);

               dst.reg_offset++;
               tmp.reg_offset++;
            }
            inst->remove();

            /* Appending an instruction may have changed our bblock end. */
            if (inst == block->end) {
               block->end = copy;
            }

            /* Continue iteration with copy->next */
            inst = copy;
         }
      }

      ip++;
   }

   ralloc_free(mem_ctx);

   return progress;
}
void JitArm::SafeStoreFromReg(bool fastmem, s32 dest, u32 value, s32 regOffset, int accessSize, s32 offset)
{
	if (Core::g_CoreStartupParameter.bFastmem && fastmem)
	{
		ARMReg RA;
		ARMReg RB;
		ARMReg RS = gpr.R(value);

		if (dest != -1)
			RA = gpr.R(dest);

		if (regOffset != -1)
		{
			RB = gpr.R(regOffset);
			MOV(R10, RB);
			NOP(1);
		}
		else
			MOVI2R(R10, (u32)offset, false);

		if (dest != -1)
			ADD(R10, R10, RA);
		else
			NOP(1);

		MOV(R12, RS);
		UnsafeStoreFromReg(R10, R12, accessSize, 0);
		return;
	}

	ARMReg rA = gpr.GetReg();
	ARMReg rB = gpr.GetReg();
	ARMReg rC = gpr.GetReg();
	ARMReg RA;
	ARMReg RB;
	if (dest != -1)
		RA = gpr.R(dest);
	if (regOffset != -1)
		RB = gpr.R(regOffset);
	ARMReg RS = gpr.R(value);

	switch (accessSize)
	{
		case 32:
			MOVI2R(rA, (u32)&Memory::Write_U32);
			break;
		case 16:
			MOVI2R(rA, (u32)&Memory::Write_U16);
			break;
		case 8:
			MOVI2R(rA, (u32)&Memory::Write_U8);
			break;
	}

	MOV(rB, RS);
	if (regOffset == -1)
		MOVI2R(rC, offset);
	else
		MOV(rC, RB);
	if (dest != -1)
		ADD(rC, rC, RA);

	PUSH(4, R0, R1, R2, R3);
	MOV(R0, rB);
	MOV(R1, rC);
	BL(rA);
	POP(4, R0, R1, R2, R3);

	gpr.Unlock(rA, rB, rC);
}
void JitArm::stX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreOff)

	u32 a = inst.RA, b = inst.RB, s = inst.RS;
	s32 offset = inst.SIMM_16;
	u32 accessSize = 0;
	s32 regOffset = -1;
	bool zeroA = true;
	bool update = false;
	bool fastmem = false;

	switch (inst.OPCD)
	{
		case 45: // sthu
			update = true;
			// fallthrough
		case 44: // sth
			accessSize = 16;
			break;
		case 31:
			switch (inst.SUBOP10)
			{
				case 183: // stwux
					zeroA = false;
					update = true;
					// fallthrough
				case 151: // stwx
					fastmem = true;
					accessSize = 32;
					regOffset = b;
					break;
				case 247: // stbux
					zeroA = false;
					update = true;
					// fallthrough
				case 215: // stbx
					accessSize = 8;
					regOffset = b;
					break;
				case 439: // sthux
					zeroA = false;
					update = true;
					// fallthrough
				case 407: // sthx
					accessSize = 16;
					regOffset = b;
					break;
			}
			break;
		case 37: // stwu
			update = true;
			// fallthrough
		case 36: // stw
			fastmem = true;
			accessSize = 32;
			break;
		case 39: // stbu
			update = true;
			// fallthrough
		case 38: // stb
			accessSize = 8;
			break;
	}

	SafeStoreFromReg(fastmem, zeroA ? a ? a : -1 : a, s, regOffset, accessSize, offset);

	if (update)
	{
		ARMReg rA = gpr.GetReg();
		ARMReg RB;
		ARMReg RA = gpr.R(a);
		if (regOffset != -1)
			RB = gpr.R(regOffset);

		// Check for DSI exception prior to writing back address
		LDR(rA, R9, PPCSTATE_OFF(Exceptions));
		CMP(rA, EXCEPTION_DSI);
		FixupBranch DoNotWrite = B_CC(CC_EQ);
		if (a)
		{
			if (regOffset == -1)
				MOVI2R(rA, offset);
			else
				MOV(rA, RB);
			ADD(RA, RA, rA);
		}
		else if (regOffset == -1)
			MOVI2R(RA, (u32)offset);
		else
			MOV(RA, RB);
		SetJumpTarget(DoNotWrite);
		gpr.Unlock(rA);
	}
}
void Jit::Comp_mxc1(u32 op)
{
	CONDITIONAL_DISABLE;

	int fs = _FS;
	int rt = _RT;

	switch ((op >> 21) & 0x1f)
	{
	case 0: // R(rt) = FI(fs); break; //mfc1
		// Let's just go through RAM for now.
		fpr.FlushR(fs);
		gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
		LDR(gpr.R(rt), CTXREG, fpr.GetMipsRegOffset(fs));
		return;

	case 2: //cfc1
		if (fs == 31)
		{
			gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
			LDR(R0, CTXREG, offsetof(MIPSState, fpcond));
			AND(R0, R0, Operand2(1)); // Just in case
			LDR(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr31));
			BIC(gpr.R(rt), gpr.R(rt), Operand2(0x1 << 23));
			ORR(gpr.R(rt), gpr.R(rt), Operand2(R0, ST_LSL, 23));
		}
		else if (fs == 0)
		{
			gpr.MapReg(rt, MAP_DIRTY | MAP_NOINIT);
			LDR(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr0));
		}
		return;

	case 4: //FI(fs) = R(rt); break; //mtc1
		// Let's just go through RAM for now.
		gpr.FlushR(rt);
		fpr.MapReg(fs, MAP_DIRTY | MAP_NOINIT);
		VLDR(fpr.R(fs), CTXREG, gpr.GetMipsRegOffset(rt));
		return;

	case 6: //ctc1
		if (fs == 31)
		{
			gpr.MapReg(rt, 0);

			// Hardware rounding method.
			// Left here in case it is faster than conditional method.
			/*
			AND(R0, gpr.R(rt), Operand2(3));
			// MIPS Rounding Mode <-> ARM Rounding Mode
			//         0, 1, 2, 3 <->  0, 3, 1, 2
			CMP(R0, Operand2(1));
			SetCC(CC_EQ); ADD(R0, R0, Operand2(2));
			SetCC(CC_GT); SUB(R0, R0, Operand2(1));
			SetCC(CC_AL);

			// Load and Store RM to FPSCR
			VMRS(R1);
			BIC(R1, R1, Operand2(0x3 << 22));
			ORR(R1, R1, Operand2(R0, ST_LSL, 22));
			VMSR(R1);
			*/

			// Update MIPS state
			STR(gpr.R(rt), CTXREG, offsetof(MIPSState, fcr31));
			MOV(R0, Operand2(gpr.R(rt), ST_LSR, 23));
			AND(R0, R0, Operand2(1));
			STR(R0, CTXREG, offsetof(MIPSState, fpcond));
		}
		return;
	}
}
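/* The cfc1/ctc1 paths above keep MIPSState::fpcond mirrored with bit 23 of
 * fcr31 (the FPU condition bit). The same bookkeeping in plain C++ (struct
 * and names are illustrative, not the emulator's actual types):
 */
#include <cstdint>

struct FpuCtl { uint32_t fcr31; uint32_t fpcond; };

// ctc1: store fcr31 and extract the condition bit into its mirror.
static void WriteFcr31(FpuCtl &f, uint32_t value) {
	f.fcr31 = value;
	f.fpcond = (value >> 23) & 1;
}

// cfc1: rebuild bit 23 of fcr31 from the mirror before reading it out.
static uint32_t ReadFcr31(const FpuCtl &f) {
	return (f.fcr31 & ~(1u << 23)) | ((f.fpcond & 1) << 23);
}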
void Jit64::reg_imm(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(Integer)

	u32 d = inst.RD, a = inst.RA, s = inst.RS;
	switch (inst.OPCD)
	{
	case 14: // addi
		// occasionally used as MOV - emulate, with immediate propagation
		if (gpr.R(a).IsImm() && d != a && a != 0)
		{
			gpr.SetImmediate32(d, (u32)gpr.R(a).offset + (u32)(s32)(s16)inst.SIMM_16);
		}
		else if (inst.SIMM_16 == 0 && d != a && a != 0)
		{
			gpr.Lock(a, d);
			gpr.BindToRegister(d, false, true);
			MOV(32, gpr.R(d), gpr.R(a));
			gpr.UnlockAll();
		}
		else
		{
			regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD); //addi
		}
		break;
	case 15:
		if (a == 0) // lis
		{
			// Merge with the next instruction if loading a 32-bit immediate value (lis + addi, lis + ori).
			if (!js.isLastInstruction && !Core::g_CoreStartupParameter.bEnableDebugging)
			{
				if ((js.next_inst.OPCD == 14) && (js.next_inst.RD == d) && (js.next_inst.RA == d)) // addi
				{
					gpr.SetImmediate32(d, ((u32)inst.SIMM_16 << 16) + (u32)(s32)js.next_inst.SIMM_16);
					js.downcountAmount++;
					js.skipnext = true;
					break;
				}
				else if ((js.next_inst.OPCD == 24) && (js.next_inst.RA == d) && (js.next_inst.RS == d)) // ori
				{
					gpr.SetImmediate32(d, ((u32)inst.SIMM_16 << 16) | (u32)js.next_inst.UIMM);
					js.downcountAmount++;
					js.skipnext = true;
					break;
				}
			}

			// Not merged
			regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD);
		}
		else // addis
		{
			regimmop(d, a, false, (u32)inst.SIMM_16 << 16, Add, &XEmitter::ADD);
		}
		break;
	case 24:
		if (a == 0 && s == 0 && inst.UIMM == 0 && !inst.Rc) // check for nop
		{
			// Make the nop visible in the generated code. Not much use, but interesting if we see one.
			NOP();
			return;
		}
		regimmop(a, s, true, inst.UIMM, Or, &XEmitter::OR);
		break; //ori
	case 25: regimmop(a, s, true, inst.UIMM << 16, Or, &XEmitter::OR, false); break; //oris
	case 28: regimmop(a, s, true, inst.UIMM, And, &XEmitter::AND, true); break; //andi
	case 29: regimmop(a, s, true, inst.UIMM << 16, And, &XEmitter::AND, true); break; //andis
	case 26: regimmop(a, s, true, inst.UIMM, Xor, &XEmitter::XOR, false); break; //xori
	case 27: regimmop(a, s, true, inst.UIMM << 16, Xor, &XEmitter::XOR, false); break; //xoris
	case 12: regimmop(d, a, false, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD, false, true); break; //addic
	case 13: regimmop(d, a, true, (u32)(s32)inst.SIMM_16, Add, &XEmitter::ADD, true, true); break; //addic_rc
	default:
		Default(inst);
		break;
	}
}
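/* A worked example of the lis-fusion above (values illustrative): the pair
 * lis d, HI / ori d, d, LO collapses to (HI << 16) | LO, while lis + addi
 * sign-extends the low half before adding.
 */
#include <cstdint>
#include <cstdio>

int main() {
	// lis r3, 0x8000 ; ori r3, r3, 0x1234
	uint32_t lis_ori = ((uint32_t)0x8000 << 16) | 0x1234;  // 0x80001234
	// lis r3, 0x8001 ; addi r3, r3, -0x10 (SIMM_16 = 0xFFF0)
	uint32_t lis_addi = ((uint32_t)0x8001 << 16) + (uint32_t)(int32_t)(int16_t)0xFFF0;  // 0x8000FFF0
	printf("%08X %08X\n", lis_ori, lis_addi);
}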
const u8 *Jit::DoJit(u32 em_address, JitBlock *b)
{
	js.cancel = false;
	js.blockStart = js.compilerPC = mips_->pc;
	js.nextExit = 0;
	js.downcountAmount = 0;
	js.curBlock = b;
	js.compiling = true;
	js.inDelaySlot = false;
	js.afterOp = JitState::AFTER_NONE;
	js.PrefixStart();

	// We add a check before the block, used when entering from a linked block.
	b->checkedEntry = GetCodePtr();
	// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
	FixupBranch skip = J_CC(CC_NBE);
	MOV(32, M(&mips_->pc), Imm32(js.blockStart));
	JMP(asm_.outerLoop, true);  // downcount hit zero - go advance.
	SetJumpTarget(skip);

	b->normalEntry = GetCodePtr();

	MIPSAnalyst::AnalysisResults analysis = MIPSAnalyst::Analyze(em_address);

	gpr.Start(mips_, analysis);
	fpr.Start(mips_, analysis);

	js.numInstructions = 0;
	while (js.compiling)
	{
		// Jit breakpoints are quite fast, so let's do them in release too.
		CheckJitBreakpoint(js.compilerPC, 0);

		MIPSOpcode inst = Memory::Read_Instruction(js.compilerPC);
		js.downcountAmount += MIPSGetInstructionCycleEstimate(inst);

		MIPSCompileOp(inst);

		if (js.afterOp & JitState::AFTER_CORE_STATE)
		{
			// TODO: Save/restore?
			FlushAll();

			// If we're rewinding, CORE_NEXTFRAME should not cause a rewind.
			// It doesn't really matter either way if we're not rewinding.
			// CORE_RUNNING is <= CORE_NEXTFRAME.
			CMP(32, M((void*)&coreState), Imm32(CORE_NEXTFRAME));
			FixupBranch skipCheck = J_CC(CC_LE);
			if (js.afterOp & JitState::AFTER_REWIND_PC_BAD_STATE)
				MOV(32, M(&mips_->pc), Imm32(js.compilerPC));
			else
				MOV(32, M(&mips_->pc), Imm32(js.compilerPC + 4));
			WriteSyscallExit();
			SetJumpTarget(skipCheck);

			js.afterOp = JitState::AFTER_NONE;
		}

		js.compilerPC += 4;
		js.numInstructions++;

		// Safety check, in case we get a bunch of really large jit ops without a lot of branching.
		if (GetSpaceLeft() < 0x800)
		{
			FlushAll();
			WriteExit(js.compilerPC, js.nextExit++);
			js.compiling = false;
		}
	}

	b->codeSize = (u32)(GetCodePtr() - b->normalEntry);

	NOP();
	AlignCode4();

	b->originalSize = js.numInstructions;
	return b->normalEntry;
}
void
gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;
   unsigned binding;
   unsigned num_bindings = prog_data->num_transform_feedback_bindings;
   src_reg sol_temp(this, glsl_type::uvec4_type);

   /* Check for buffer overflow: we need room to write the complete primitive
    * (all vertices).  Otherwise, avoid writing any vertices for it.
    */
   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, 1u));
   emit(MUL(dst_reg(sol_temp), sol_temp, src_reg(num_verts)));
   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Avoid overwriting MRF 1 as it is used as URB write message header */
      dst_reg mrf_reg(MRF, 2);

      this->current_annotation = "gen6: emit SOL vertex data";
      /* For each vertex, generate code to output each varying using the
       * appropriate binding table entry.
       */
      for (binding = 0; binding < num_bindings; ++binding) {
         unsigned char varying =
            prog_data->transform_feedback_bindings[binding];

         /* Set up the correct destination index for this vertex */
         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
                                       mrf_reg,
                                       this->destination_indices);
         inst->sol_vertex = vertex % num_verts;

         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
          *
          *   "Prior to End of Thread with a URB_WRITE, the kernel must
          *    ensure that all writes are complete by sending the final
          *    write as a committed write."
          */
         bool final_write = binding == (unsigned) num_bindings - 1 &&
                            inst->sol_vertex == num_verts - 1;

         /* Compute offset of this varying for the current vertex
          * in vertex_output
          */
         this->current_annotation = output_reg_annotation[varying];
         src_reg data(this->vertex_output);
         data.reladdr = ralloc(mem_ctx, src_reg);
         int offset = get_vertex_output_offset_for_varying(vertex, varying);
         emit(MOV(dst_reg(this->vertex_output_offset), offset));
         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
         data.type = output_reg[varying].type;

         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
          * same slot, so make sure we write the appropriate channel.
          */
         if (varying == VARYING_SLOT_PSIZ)
            data.swizzle = BRW_SWIZZLE_WWWW;
         else if (varying == VARYING_SLOT_LAYER)
            data.swizzle = BRW_SWIZZLE_YYYY;
         else if (varying == VARYING_SLOT_VIEWPORT)
            data.swizzle = BRW_SWIZZLE_ZZZZ;
         else
            data.swizzle = prog_data->transform_feedback_swizzles[binding];

         /* Write data */
         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
         inst->sol_binding = binding;
         inst->sol_final_write = final_write;

         if (final_write) {
            /* This is the last vertex of the primitive, so increment the
             * SO num primitive counter and the destination indices.
             */
            emit(ADD(dst_reg(this->destination_indices),
                     this->destination_indices,
                     src_reg(num_verts)));
            emit(ADD(dst_reg(this->sol_prim_written),
                     this->sol_prim_written, 1u));
         }
      }
      this->current_annotation = NULL;
   }
   emit(BRW_OPCODE_ENDIF);
}
void
gen6_gs_visitor::xfb_write()
{
   unsigned num_verts;
   struct brw_gs_prog_data *prog_data =
      (struct brw_gs_prog_data *) &c->prog_data;

   if (!prog_data->num_transform_feedback_bindings)
      return;

   switch (c->prog_data.output_topology) {
   case _3DPRIM_POINTLIST:
      num_verts = 1;
      break;
   case _3DPRIM_LINELIST:
   case _3DPRIM_LINESTRIP:
   case _3DPRIM_LINELOOP:
      num_verts = 2;
      break;
   case _3DPRIM_TRILIST:
   case _3DPRIM_TRIFAN:
   case _3DPRIM_TRISTRIP:
   case _3DPRIM_RECTLIST:
      num_verts = 3;
      break;
   case _3DPRIM_QUADLIST:
   case _3DPRIM_QUADSTRIP:
   case _3DPRIM_POLYGON:
      num_verts = 3;
      break;
   default:
      unreachable("Unexpected primitive type in Gen6 SOL program.");
   }

   this->current_annotation = "gen6 thread end: svb writes init";

   emit(MOV(dst_reg(this->vertex_output_offset), 0u));
   emit(MOV(dst_reg(this->sol_prim_written), 0u));

   /* Check that at least one primitive can be written
    *
    * Note: since we use the binding table to keep track of buffer offsets
    * and stride, the GS doesn't need to keep track of a separate pointer
    * into each buffer; it uses a single pointer which increments by 1 for
    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
    * transform feedback is in interleaved or separate attribs mode.
    */
   src_reg sol_temp(this, glsl_type::uvec4_type);
   emit(ADD(dst_reg(sol_temp), this->svbi, src_reg(num_verts)));

   /* Compare SVBI calculated number with the maximum value, which is
    * in R1.4 (previously saved in this->max_svbi) for gen6.
    */
   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      src_reg destination_indices_uw =
         retype(destination_indices, BRW_REGISTER_TYPE_UW);

      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw),
                                        brw_imm_v(0x00020100))); /* (0, 1, 2) */
      inst->force_writemask_all = true;

      emit(ADD(dst_reg(this->destination_indices),
               this->destination_indices,
               this->svbi));
   }
   emit(BRW_OPCODE_ENDIF);

   /* Write transform feedback data for all processed vertices. */
   for (int i = 0; i < c->gp->program.VerticesOut; i++) {
      emit(MOV(dst_reg(sol_temp), i));
      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
               BRW_CONDITIONAL_L));
      emit(IF(BRW_PREDICATE_NORMAL));
      {
         xfb_program(i, num_verts);
      }
      emit(BRW_OPCODE_ENDIF);
   }
}
void
gen6_gs_visitor::emit_prolog()
{
   vec4_gs_visitor::emit_prolog();

   /* Gen6 geometry shaders require allocating an initial VUE handle via an
    * FF_SYNC message; however, the documentation remarks that only one thread
    * can write to the URB simultaneously and the FF_SYNC message provides the
    * synchronization mechanism for this, so using this message effectively
    * stalls the thread until it is its turn to write to the URB. Because of
    * this, the best way to implement geometry shader algorithms in gen6 is to
    * execute the algorithm before the FF_SYNC message to maximize parallelism.
    *
    * To achieve this we buffer the geometry shader outputs for each emitted
    * vertex in vertex_output during operation. Then, when we have processed
    * the last vertex (that is, at thread end time), we send the FF_SYNC
    * message to allocate the initial VUE handle and write all buffered vertex
    * data to the URB in one go.
    *
    * For each emitted vertex, vertex_output will hold vue_map.num_slots
    * data items plus one additional item to hold required flags
    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
    * which come right after the data items for that vertex. Vertex data and
    * flags for the next vertex come right after the data items and flags for
    * the previous vertex.
    */
   this->current_annotation = "gen6 prolog";
   this->vertex_output = src_reg(this,
                                 glsl_type::uint_type,
                                 (prog_data->vue_map.num_slots + 1) *
                                 c->gp->program.VerticesOut);
   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->vertex_output_offset), src_reg(0u)));

   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
    * so initialize it once to R0.
    */
   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
                                     retype(brw_vec8_grf(0, 0),
                                            BRW_REGISTER_TYPE_UD)));
   inst->force_writemask_all = true;

   /* This will be used as a temporary to store writeback data of FF_SYNC
    * and URB_WRITE messages.
    */
   this->temp = src_reg(this, glsl_type::uint_type);

   /* This will be used to know when we are processing the first vertex of
    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
    * that we are processing the first vertex in the primitive and to zero
    * otherwise. This way we can use its value directly in the URB write
    * headers.
    */
   this->first_vertex = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->first_vertex), URB_WRITE_PRIM_START));

   /* The FF_SYNC message requires the number of primitives generated, so
    * keep a counter for this.
    */
   this->prim_count = src_reg(this, glsl_type::uint_type);
   emit(MOV(dst_reg(this->prim_count), 0u));

   if (c->prog_data.gen6_xfb_enabled) {
      /* Create a virtual register to hold destination indices in SOL */
      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold number of written primitives */
      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
      this->svbi = src_reg(this, glsl_type::uvec4_type);
      /* Create a virtual register to hold max values of SVBI */
      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
      emit(MOV(dst_reg(this->max_svbi),
               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));

      xfb_setup();
   }

   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
    * needs it we have to move it to a separate register where we can map
    * the attribute.
    *
    * Notice that we cannot use a virtual register for this, because we need
    * to map all input attributes to hardware registers in setup_payload(),
    * which happens before virtual registers are mapped to hardware registers.
    * We could work around that issue if we were able to compute the first
    * non-payload register here and move the PrimitiveID information to that
    * register, but we can't because at this point we don't know the final
    * number of uniforms that will be included in the payload.
    *
    * So, what we do is place the PrimitiveID information in r1, which is
    * always delivered as part of the payload, but it is only populated with
    * data relevant for transform feedback when we set
    * GEN6_GS_SVBI_PAYLOAD_ENABLE in the 3DSTATE_GS state packet. That
    * information can be obtained by other means though, so we can safely use
    * r1 for this purpose.
    */
   if (c->prog_data.include_primitive_id) {
      this->primitive_id =
         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
   }
}
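/* The buffering scheme described in the prolog comment, as a hedged helper
 * (illustrative, not driver code): each emitted vertex occupies
 * vue_map.num_slots data items plus one flags item in vertex_output.
 */
static inline unsigned
vertex_output_index(unsigned vertex, unsigned slot, unsigned num_slots)
{
   return vertex * (num_slots + 1) + slot;
}
/* The flags item for a vertex lives at slot == num_slots, which is why
 * emit_thread_end() adds 1 after the last data item to skip over it.
 */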
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4();
	}

	LEA(32, tempReg1, MScaled(vReg, SCALE_4, 0));
	AND(32, R(tempReg1), Imm8(31));
	AND(32, R(vReg), Imm8(~7));

	MOV(32, R(tempReg2), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(tempReg2), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(tempReg2), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		return false;
	}
	AND(32, R(tempReg2), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(tempReg1), R(tempReg2));
	ADD(32, R(tempReg1), R(resultReg));

	// We may clobber srcReg in the MUL, so let's grab it now.
	LEA(64, tempReg1, MComplex(srcReg, tempReg1, SCALE_4, 0));

	LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
	MUL(32, R(vReg));

	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(tempReg1, EAX));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		// Multiply by two by just adding twice.
		ADD(32, R(EAX), R(uReg));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	default:
		return false;
	}

	return true;
}
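/* Scalar reference for the swizzled addressing the JIT above computes with
 * LEA/MUL tricks: PSP textures are stored in 16-byte-wide, 8-row blocks laid
 * out row-major. A hedged plain-C++ sketch (names illustrative):
 */
#include <cstdint>

static uint32_t SwizzledOffset(uint32_t xBytes, uint32_t y, uint32_t rowWidthBytes) {
	const uint32_t blockX = xBytes / 16;
	const uint32_t blockY = y / 8;
	const uint32_t blocksPerRow = rowWidthBytes / 16;
	// Each block holds 16 * 8 = 128 bytes.
	return (blockY * blocksPerRow + blockX) * 128 + (y % 8) * 16 + (xBytes % 16);
}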
void Jit64::GetCarryEAXAndClear()
{
	MOV(32, R(EAX), M(&PowerPC::ppcState.spr[SPR_XER]));
	BTR(32, R(EAX), Imm8(29));
}
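/* BTR copies bit 29 of XER (the PowerPC carry flag, CA) into the host carry
 * flag and clears it in EAX in a single instruction. Equivalent scalar
 * logic, as a sketch:
 */
#include <cstdint>

static bool TakeCarryAndClear(uint32_t &xer) {
	bool ca = (xer >> 29) & 1;  // XER[CA] lives at bit 29
	xer &= ~(1u << 29);         // cleared copy, to be written back later
	return ca;
}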
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM.  Separate CLUTs is uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);
	RET();

	EndWrite();
	return (LinearFunc)start;
}
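/* Scalar reference for the SSE blend above: the four nearest samples are
 * lerped with 8-bit fractions, and by256 is assumed to hold 1/256 constants
 * so the 0-255 fractions become 0.0-1.0 weights. One channel, hedged sketch:
 */
#include <cstdint>

static uint8_t Lerp2D(uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
                      uint32_t frac_u, uint32_t frac_v) {
	// Horizontal blends: weight frac_u/256 on the right-hand samples.
	uint32_t top = (tl * (256 - frac_u) + tr * frac_u) >> 8;
	uint32_t bottom = (bl * (256 - frac_u) + br * frac_u) >> 8;
	// Vertical blend between the two rows.
	return (uint8_t)((top * (256 - frac_v) + bottom * frac_v) >> 8);
}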
void decodeInstruction(instruction_t instruction, unsigned long r[], unsigned long *bandera,
                       unsigned long *PC, unsigned long *LR, uint8_t *memoria,
                       unsigned long *codificacion)
{
    int auxban;
    unsigned long aux1, aux2, des;

    /* Fetch the source operands once: type 'R' selects a register value and
     * type '#' an immediate. This replaces the four near-identical
     * type-combination branches repeated for every mnemonic.
     */
    unsigned long op1 = (instruction.op1_type == 'R') ? r[instruction.op1_value] : instruction.op1_value;
    unsigned long op2 = (instruction.op2_type == 'R') ? r[instruction.op2_value] : instruction.op2_value;
    unsigned long op3 = (instruction.op3_type == 'R') ? r[instruction.op3_value] : instruction.op3_value;

    /* ALU instruction decoding. A destination type of 'R' writes a register;
     * 'N' only updates the flags (bandera) and discards the result.
     */
    if (strcmp(instruction.mnemonic, "ADDS") == 0) {
        if (instruction.op1_type == 'R')
            r[instruction.op1_value] = ADD(op2, op3, bandera);
        else if (instruction.op1_type == 'N')
            ADD(op2, op3, bandera);
        mostrar(r[instruction.op1_value]);
    }
    if (strcmp(instruction.mnemonic, "CMN") == 0)  /* add, flags only */
        ADD(op1, op2, bandera);

    if (strcmp(instruction.mnemonic, "ADCS") == 0) {
        if (instruction.op1_type == 'R')
            r[instruction.op1_value] = ADC(op2, op3, bandera);
        else if (instruction.op1_type == 'N')
            ADC(op2, op3, bandera);
        mostrar(r[instruction.op1_value]);
    }
    if (strcmp(instruction.mnemonic, "ANDS") == 0) {
        if (instruction.op1_type == 'R')
            r[instruction.op1_value] = AND(op2, op3, bandera);
        else if (instruction.op1_type == 'N')
            AND(op2, op3, bandera);
        mostrar(r[instruction.op1_value]);
    }
    if (strcmp(instruction.mnemonic, "TEST") == 0)  /* and, flags only */
        AND(op1, op2, bandera);

    if (strcmp(instruction.mnemonic, "EORS") == 0) {
        if (instruction.op1_type == 'R')
            r[instruction.op1_value] = EOR(op2, op3, bandera);
        else if (instruction.op1_type == 'N')
            EOR(op2, op3, bandera);
        mostrar(r[instruction.op1_value]);
    }
    if (strcmp(instruction.mnemonic, "MOVS") == 0 || strcmp(instruction.mnemonic, "MOV") == 0) {
        if (instruction.op1_type == 'R') {
            r[instruction.op1_value] = MOV(r[instruction.op1_value], op2, bandera);
            mostrar(r[instruction.op1_value]);
        }
    }
    if (strcmp(instruction.mnemonic, "ORRS") == 0) {
        if (instruction.op1_type == 'R')
            r[instruction.op1_value] = ORR(op2, op3, bandera);
        else if (instruction.op1_type == 'N')
            ORR(op2, op3, bandera);
        mostrar(r[instruction.op1_value]);
    }
    if (strcmp(instruction.mnemonic, "SUBS") == 0) {
        if (instruction.op1_type == 'R')
            r[instruction.op1_value] = SUB(op2, op3, bandera);
        else if (instruction.op1_type == 'N')
            SUB(op2, op3, bandera);
        mostrar(r[instruction.op1_value]);
    }
    if (strcmp(instruction.mnemonic, "CMP") == 0)  /* subtract, flags only */
        SUB(op1, op2, bandera);

    /* Branch instruction decoding. The re-encoded halfword stored in
     * *codificacion uses a 5-bit opcode in bits 15-11 (28 for B, 13 for the
     * conditional branches, 31 for BL), a condition code in bits 10-8, and
     * the offset in the low bits.
     */
    if (instruction.op1_type == '#') {
        if (strcmp(instruction.mnemonic, "B") == 0) {
            *codificacion = (28UL << 11) + instruction.op1_value;
            B(PC, instruction.op1_value);
        }
        if (strcmp(instruction.mnemonic, "BEQ") == 0) {
            *codificacion = (13UL << 11) + instruction.op1_value;
            BEQ(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BNE") == 0) {
            *codificacion = (13UL << 11) + (1UL << 8) + instruction.op1_value;
            BNE(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BCS") == 0) {
            *codificacion = (13UL << 11) + (2UL << 8) + instruction.op1_value;
            BCS(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BCC") == 0) {
            *codificacion = (13UL << 11) + (3UL << 8) + instruction.op1_value;
            BCC(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BMI") == 0) {
            *codificacion = (13UL << 11) + (4UL << 8) + instruction.op1_value;
            BMI(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BPL") == 0) {
            *codificacion = (13UL << 11) + (5UL << 8) + instruction.op1_value;
            BPL(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BVS") == 0) {
            *codificacion = (13UL << 11) + (6UL << 8) + instruction.op1_value;
            BVS(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BVC") == 0) {
            *codificacion = (13UL << 11) + (7UL << 8) + instruction.op1_value;
            BVC(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BHI") == 0) {
            *codificacion = (13UL << 11) + (8UL << 8) + instruction.op1_value;
            BHI(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BLS") == 0) {
            *codificacion = (13UL << 11) + (9UL << 8) + instruction.op1_value;
            BLS(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BGE") == 0) {
            *codificacion = (13UL << 11) + (10UL << 8) + instruction.op1_value;
            BGE(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BLT") == 0) {
            *codificacion = (13UL << 11) + (11UL << 8) + instruction.op1_value;
            BLT(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BGT") == 0) {
            *codificacion = (13UL << 11) + (12UL << 8) + instruction.op1_value;
            BGT(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BLE") == 0) {
            *codificacion = (13UL << 11) + (13UL << 8) + instruction.op1_value;
            BLE(PC, instruction.op1_value, bandera);
        }
        if (strcmp(instruction.mnemonic, "BL") == 0) {
            *codificacion = (31UL << 11) +
                (2047UL & (instruction.op1_value + (((1UL << 31) & instruction.op1_value) >> 20)));
            BL(PC, instruction.op1_value, LR);
        }
    }
}
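/* The *codificacion values above follow this decoder's fixed-field layout: a
 * 5-bit opcode in bits 15-11 (28 for B, 13 for conditional branches, 31 for
 * BL), a condition code in bits 10-8, and an 8-bit offset below. A hedged
 * encoder sketch (helper name is illustrative):
 */
#include <cstdio>

static unsigned long EncodeCondBranch(unsigned cond, unsigned long offset8) {
    return (13UL << 11) | ((unsigned long)cond << 8) | (offset8 & 0xFF);
}

int main() {
    // cond: 0 = EQ, 1 = NE, 2 = CS, 3 = CC, ... 13 = LE, matching the cases above.
    printf("BNE +5 -> 0x%04lX\n", EncodeCondBranch(1, 5));  // 0x6905
    return 0;
}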
void
vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   dst_reg dest;
   src_reg src;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_per_vertex_input: {
      /* The EmitNoIndirectInput flag guarantees our vertex index will
       * be constant. We should handle indirects someday.
       */
      nir_const_value *vertex = nir_src_as_const_value(instr->src[0]);
      nir_const_value *offset = nir_src_as_const_value(instr->src[1]);

      /* Make up a type...we have no way of knowing... */
      const glsl_type *const type = glsl_type::ivec(instr->num_components);

      src = src_reg(ATTR, BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
                          instr->const_index[0] + offset->u32[0],
                    type);
      src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));

      /* gl_PointSize is passed in the .w component of the VUE header */
      if (instr->const_index[0] == VARYING_SLOT_PSIZ)
         src.swizzle = BRW_SWIZZLE_WWWW;

      dest = get_nir_dest(instr->dest, src.type);
      dest.writemask = brw_writemask_for_size(instr->num_components);
      emit(MOV(dest, src));
      break;
   }

   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should have produced per_vertex intrinsics");

   case nir_intrinsic_emit_vertex_with_counter: {
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      int stream_id = instr->const_index[0];
      gs_emit_vertex(stream_id);
      break;
   }

   case nir_intrinsic_end_primitive_with_counter:
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      gs_end_primitive();
      break;

   case nir_intrinsic_set_vertex_count:
      this->vertex_count =
         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
      break;

   case nir_intrinsic_load_primitive_id:
      assert(gs_prog_data->include_primitive_id);
      dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
      break;

   case nir_intrinsic_load_invocation_id: {
      src_reg invocation_id =
         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
      assert(invocation_id.file != BAD_FILE);
      dest = get_nir_dest(instr->dest, invocation_id.type);
      emit(MOV(dest, invocation_id));
      break;
   }

   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
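/* Note on the ATTR addressing above (a reading aid, not new behavior): the
 * inputs of all incoming vertices are laid out back-to-back in the ATTR file,
 * one BRW_VARYING_SLOT_COUNT-sized block per vertex, so reading slot S of
 * vertex V resolves to ATTR register V * BRW_VARYING_SLOT_COUNT + S (plus the
 * constant offset folded in by the NIR lowering). */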
/**
 * Write out a batch of 32 control data bits from the control_data_bits
 * register to the URB.
 *
 * The current value of the vertex_count register determines which DWORD in
 * the URB receives the control data bits. The control_data_bits register is
 * assumed to contain the correct data for the vertex that was most recently
 * output, and all previous vertices that share the same DWORD.
 *
 * This function takes care of ensuring that if no vertices have been output
 * yet, no control bits are emitted.
 */
void
vec4_gs_visitor::emit_control_data_bits()
{
   assert(c->control_data_bits_per_vertex != 0);

   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
    * granularity, we need to use two tricks to ensure that the batch of 32
    * control data bits is written to the appropriate DWORD in the URB. To
    * select which vec4 we are writing to, we use the "slot {0,1} offset"
    * fields of the message header. To select which DWORD in the vec4 we are
    * writing to, we use the channel mask fields of the message header. To
    * avoid penalizing geometry shaders that emit a small number of vertices
    * with extra bookkeeping, we only do each of these tricks when
    * c->prog_data.control_data_header_size_bits is large enough to make it
    * necessary.
    *
    * Note: this means that if we're outputting just a single DWORD of control
    * data bits, we'll actually replicate it four times since we won't do any
    * channel masking. But that's not a problem since in this case the
    * hardware only pays attention to the first DWORD.
    */
   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
   if (c->control_data_header_size_bits > 32)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
   if (c->control_data_header_size_bits > 128)
      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;

   /* If we are using either channel masks or a per-slot offset, then we
    * need to figure out which DWORD we are trying to write to, using the
    * formula:
    *
    *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
    *
    * Since bits_per_vertex is a power of two, and is known at compile
    * time, this can be optimized to:
    *
    *     dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
    */
   src_reg dword_index(this, glsl_type::uint_type);
   if (urb_write_flags) {
      src_reg prev_count(this, glsl_type::uint_type);
      emit(ADD(dst_reg(prev_count), this->vertex_count,
               brw_imm_ud(0xffffffffu)));
      unsigned log2_bits_per_vertex =
         util_last_bit(c->control_data_bits_per_vertex);
      emit(SHR(dst_reg(dword_index), prev_count,
               brw_imm_ud(6 - log2_bits_per_vertex)));
   }

   /* Start building the URB write message. The first MRF gets a copy of
    * R0.
    */
   int base_mrf = 1;
   dst_reg mrf_reg(MRF, base_mrf);
   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
   inst->force_writemask_all = true;

   if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
      /* Set the per-slot offset to dword_index / 4, so that we'll write to
       * the appropriate OWORD within the control data header.
       */
      src_reg per_slot_offset(this, glsl_type::uint_type);
      emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
      emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
           brw_imm_ud(1u));
   }

   if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
       * write to the appropriate DWORD within the OWORD. We need to do
       * this computation with force_writemask_all, otherwise garbage data
       * from invocation 0 might clobber the mask for invocation 1 when
       * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
       * together.
       */
      src_reg channel(this, glsl_type::uint_type);
      inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
      inst->force_writemask_all = true;
      src_reg one(this, glsl_type::uint_type);
      inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
      inst->force_writemask_all = true;
      src_reg channel_mask(this, glsl_type::uint_type);
      inst = emit(SHL(dst_reg(channel_mask), one, channel));
      inst->force_writemask_all = true;
      emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
           channel_mask);
      emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
   }

   /* Store the control data bits in the message payload and send it. */
   dst_reg mrf_reg2(MRF, base_mrf + 1);
   inst = emit(MOV(mrf_reg2, this->control_data_bits));
   inst->force_writemask_all = true;
   inst = emit(GS_OPCODE_URB_WRITE);
   inst->urb_write_flags = urb_write_flags;
   /* We need to increment Global Offset by 256 bits to make room for
    * Broadwell's extra "Vertex Count" payload at the beginning of the
    * URB entry. Since this is an OWord message, Global Offset is counted
    * in 128-bit units, so we must set it to 2.
    */
   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
      inst->offset = 2;
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
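/* Worked example for the shift above (illustration only): with
 * bits_per_vertex == 2, util_last_bit(2) == 2, so the shift count is
 * 6 - 2 == 4; for the 17th vertex, prev_count == 16 and
 * dword_index == 16 >> 4 == 1 -- exactly (17 - 1) * 2 / 32, i.e. the
 * second DWORD of the control data header. */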
void JitArm::lXX(UGeckoInstruction inst)
{
    INSTRUCTION_START
    JITDISABLE(bJITLoadStoreOff)

    u32 a = inst.RA, b = inst.RB, d = inst.RD;
    s32 offset = inst.SIMM_16;
    u32 accessSize = 0;
    s32 offsetReg = -1;
    bool zeroA = true;
    bool update = false;
    bool signExtend = false;
    bool reverse = false;
    bool fastmem = false;

    switch (inst.OPCD)
    {
    case 31:
        switch (inst.SUBOP10)
        {
        case 55: // lwzux
            zeroA = false;
            update = true;
        case 23: // lwzx
            accessSize = 32;
            offsetReg = b;
            break;
        case 119: // lbzux
            zeroA = false;
            update = true;
        case 87: // lbzx
            accessSize = 8;
            offsetReg = b;
            break;
        case 311: // lhzux
            zeroA = false;
            update = true;
        case 279: // lhzx
            accessSize = 16;
            offsetReg = b;
            break;
        case 375: // lhaux
            zeroA = false;
            update = true;
        case 343: // lhax
            accessSize = 16;
            signExtend = true;
            offsetReg = b;
            break;
        case 534: // lwbrx
            accessSize = 32;
            reverse = true;
            break;
        case 790: // lhbrx
            accessSize = 16;
            reverse = true;
            break;
        }
        break;
    case 33: // lwzu
        zeroA = false;
        update = true;
    case 32: // lwz
        fastmem = true;
        accessSize = 32;
        break;
    case 35: // lbzu
        zeroA = false;
        update = true;
    case 34: // lbz
        fastmem = true;
        accessSize = 8;
        break;
    case 41: // lhzu
        zeroA = false;
        update = true;
    case 40: // lhz
        fastmem = true;
        accessSize = 16;
        break;
    case 43: // lhau
        zeroA = false;
        update = true;
    case 42: // lha
        signExtend = true;
        accessSize = 16;
        break;
    }

    // Check for exception before loading
    ARMReg rA = gpr.GetReg(false);
    LDR(rA, R9, PPCSTATE_OFF(Exceptions));
    CMP(rA, EXCEPTION_DSI);
    FixupBranch DoNotLoad = B_CC(CC_EQ);

    SafeLoadToReg(fastmem, d, zeroA ? (a ? a : -1) : a, offsetReg, accessSize, offset, signExtend, reverse);

    if (update)
    {
        rA = gpr.GetReg(false);
        ARMReg RA = gpr.R(a);
        if (offsetReg == -1)
            MOVI2R(rA, offset);
        else
            MOV(rA, gpr.R(offsetReg));
        ADD(RA, RA, rA);
    }

    SetJumpTarget(DoNotLoad);

    // LWZ idle skipping
    if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSkipIdle &&
        inst.OPCD == 32 &&
        (inst.hex & 0xFFFF0000) == 0x800D0000 &&
        (Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
         (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
        Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
    {
        ARMReg RD = gpr.R(d);
        gpr.Flush();
        fpr.Flush();

        // if it's still 0, we can wait until the next event
        TST(RD, RD);
        FixupBranch noIdle = B_CC(CC_NEQ);
        rA = gpr.GetReg();

        MOVI2R(rA, (u32)&PowerPC::OnIdle);
        MOVI2R(R0, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
        BL(rA);

        gpr.Unlock(rA);
        WriteExceptionExit();

        SetJumpTarget(noIdle);
        //js.compilerPC += 8;
        return;
    }
}
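// Reading aid for the pattern match above (PowerPC encodings, stated from the
// ISA rather than from this file): 0x800D0000 masks to "lwz r0, imm(r13)",
// 0x28000000 is "cmplwi r0, 0" (0x2C000000 the signed "cmpwi r0, 0" variant
// checked on Wii), and 0x4182fff8 is "beq -8" -- i.e. a tight "load a word
// and loop while it is zero" idle loop, which is why the generated block can
// safely sleep until the next event instead of spinning.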
void
vec4_gs_visitor::gs_emit_vertex(int stream_id)
{
   this->current_annotation = "emit vertex: safety check";

   /* Haswell and later hardware ignores the "Render Stream Select" bits
    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
    * and instead sends all primitives down the pipeline for rasterization.
    * If the SOL stage is enabled, "Render Stream Select" is honored and
    * primitives bound to non-zero streams are discarded after stream output.
    *
    * Since the only purpose of primitives sent to non-zero streams is to
    * be recorded by transform feedback, we can simply discard all geometry
    * bound to these streams when transform feedback is disabled.
    */
   if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
      return;

   /* If we're outputting 32 control data bits or less, then we can wait
    * until the shader is over to output them all. Otherwise we need to
    * output them as we go. Now is the time to do it, since we're about to
    * output the vertex_count'th vertex, so it's guaranteed that the
    * control data bits associated with the (vertex_count - 1)th vertex are
    * correct.
    */
   if (c->control_data_header_size_bits > 32) {
      this->current_annotation = "emit vertex: emit control data bits";
      /* Only emit control data bits if we've finished accumulating a batch
       * of 32 bits. This is the case when:
       *
       *     (vertex_count * bits_per_vertex) % 32 == 0
       *
       * (in other words, when the last 5 bits of vertex_count *
       * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
       * integer n (which is always the case, since bits_per_vertex is
       * always 1 or 2), this is equivalent to requiring that the last 5-n
       * bits of vertex_count are 0:
       *
       *     vertex_count & (2^(5-n) - 1) == 0
       *
       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
       * equivalent to:
       *
       *     vertex_count & (32 / bits_per_vertex - 1) == 0
       */
      vec4_instruction *inst =
         emit(AND(dst_null_ud(), this->vertex_count,
                  brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
      inst->conditional_mod = BRW_CONDITIONAL_Z;

      emit(IF(BRW_PREDICATE_NORMAL));
      {
         /* If vertex_count is 0, then no control data bits have been
          * accumulated yet, so we skip emitting them.
          */
         emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
                  BRW_CONDITIONAL_NEQ));
         emit(IF(BRW_PREDICATE_NORMAL));
         emit_control_data_bits();
         emit(BRW_OPCODE_ENDIF);

         /* Reset control_data_bits to 0 so we can start accumulating a new
          * batch.
          *
          * Note: in the case where vertex_count == 0, this neutralizes the
          * effect of any call to EndPrimitive() that the shader may have
          * made before outputting its first vertex.
          */
         inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
         inst->force_writemask_all = true;
      }
      emit(BRW_OPCODE_ENDIF);
   }

   this->current_annotation = "emit vertex: vertex data";
   emit_vertex();

   /* In stream mode we have to set control data bits for all vertices
    * unless we have disabled control data bits completely (which we do
    * for GL_POINTS outputs that don't use streams).
    */
   if (c->control_data_header_size_bits > 0 &&
       gs_prog_data->control_data_format ==
          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
      this->current_annotation = "emit vertex: Stream control data bits";
      set_stream_control_data_bits(stream_id);
   }

   this->current_annotation = NULL;
}
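/* Worked example of the batch check above (illustration only): with
 * bits_per_vertex == 2 the AND mask is 32 / 2 - 1 == 15, so the
 * conditional flush triggers when vertex_count is 16, 32, 48, ... --
 * i.e. each time another 16 vertices (16 * 2 == 32 bits) of control
 * data have been accumulated. */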
int main(int argc, const char * argv[]) {
    check_same("Mov literal", 3, Asm<int>(
        MOV(eax, 3_d),
        RET())() );
    check_same("64 bit register MOV", 6, Asm<int>(
        MOV(rax, 6_q),
        RET())() );
    check_same("Negative literal", -103, Asm<int>(
        MOV(eax, -3_d),
        ADD(eax, - - -100_d),
        RET())() );
    check_same("Move reg to reg", 4, Asm<int>(
        MOV(ecx, 4_d),
        MOV(eax, ecx),
        RET())() );
    check_same("Simple jmp", 3, Asm<int>(
        MOV(eax, 3_d),
        JMP("a"_rel8),
        ADD(eax, 2_d),
        "a"_label,
        RET())() );
    check_same("Simple loop", 30, Asm<int>(
        MOV(ecx, 5_d),
        MOV(eax, 0_d),
        "start"_label,
        CMP(ecx, 0_d),
        JE("done"_rel8),
        ADD(eax, 6_d),
        DEC(ecx),
        JMP("start"_rel8),
        "done"_label,
        RET())() );
    check_same("Macro simple loop", 30, Asm<int>(
        MOV(eax, 0_d),
        do_x_times(5_d, ADD(eax, 6_d)),
        RET())() );
    check_same("Access arg using esp", 1, Asm<int>(
        MOV(eax, _[esp + 28_d]),
        RET())(1, 2, 3) );
    check_same("Access arg using ebp", 1, Asm<int>(
        MOV(eax, _[ebp - 0xc_b]),
        RET())(1, 2, 3) );
    check_same("Index ebp", 1, Asm<int>(
        MOV(ecx, 2_d),
        MOV(eax, _[ebp + ecx * 2_b - 0x10_d]),
        RET())(1, 2, 3) );
    check_same("Access args using ebp", 5, Asm<int>(
        MOV(edx, 0_d),
        MOV(eax, _[ebp - 0xc_b]),
        MOV(ecx, _[ebp - 0x10_b]),
        DIV(ecx),
        MOV(ecx, _[ebp - 0x14_b]),
        DIV(ecx),
        RET())(100, 5, 4) );
    check_same("Access arg with 64 bit reg", 2, Asm<int>(
        MOV(rax, _[rsp + 24_d]),
        RET())(1, 2, 3) );
    check_same("Access second register zero", 1, Asm<int>(
        MOV(ecx, 0_d),
        MOV(eax, _[esp + 28_d + ecx]),
        RET())(1, 2, 3) );
    check_same("Access second register with offset", 1, Asm<int>(
        MOV(ecx, 8_d),
        MOV(eax, _[esp + 20_d + ecx]),
        RET())(1, 2, 3) );
    check_same("Access second register with offset and 1 scale", 1, Asm<int>(
        MOV(ecx, 8_d),
        MOV(eax, _[esp + 20_d + ecx * 1_b]),
        RET())(1, 2, 3) );
    check_same("Access second register with offset and 4 scale", 1, Asm<int>(
        MOV(ecx, 2_d),
        MOV(eax, _[esp + 20_d + ecx * 4_b]),
        RET())(1, 2, 3) );
    check_same("Call c function from assembly", 66, Asm<int>(
        MOV(rbx, _[rsp + 8_d]),
        CALL(rbx),
        RET())(&ret66) );
    check_same("Call c function from esp directly", 66, Asm<int>(
        CALL(_[rsp + 8_d]),
        RET())(&ret66) );
    check_same("Call c function from ebp directly", 66, Asm<int>(
        CALL(_[rbp - 0x10_d]),
        RET())(&ret66) );

    // auto p = Asm<int>(CALL(_[rbp - 0xc_d]));
    // Print<decltype(p)::program> x{};

    std::cout << "done" << std::endl;
    return 0;
}
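// A hedged sketch of one more case in the same DSL (hypothetical -- it only
// combines primitives already exercised above: literals, labels, CMP/JE and
// a backward JMP): sum 4 + 4 + 4 by looping three times.
//
//     check_same("Three times four", 12, Asm<int>(
//         MOV(ecx, 3_d),
//         MOV(eax, 0_d),
//         "loop"_label,
//         CMP(ecx, 0_d),
//         JE("end"_rel8),
//         ADD(eax, 4_d),
//         DEC(ecx),
//         JMP("loop"_rel8),
//         "end"_label,
//         RET())() );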
void JitArm::ps_cmpo1(UGeckoInstruction inst)
{
    INSTRUCTION_START
    JITDISABLE(bJITFloatingPointOff);

    u32 a = inst.FA, b = inst.FB;
    int cr = inst.CRFD;

    ARMReg vA = fpr.R1(a);
    ARMReg vB = fpr.R1(b);
    ARMReg fpscrReg = gpr.GetReg();
    ARMReg crReg = gpr.GetReg();
    Operand2 FPRFMask(0x1F, 0xA); // 0x1F000
    Operand2 LessThan(0x8, 0xA); // 0x8000
    Operand2 GreaterThan(0x4, 0xA); // 0x4000
    Operand2 EqualTo(0x2, 0xA); // 0x2000
    Operand2 NANRes(0x1, 0xA); // 0x1000
    FixupBranch Done1, Done2, Done3;
    LDR(fpscrReg, R9, PPCSTATE_OFF(fpscr));
    BIC(fpscrReg, fpscrReg, FPRFMask);

    VCMPE(vA, vB);
    VMRS(_PC);
    SetCC(CC_LT);
    ORR(fpscrReg, fpscrReg, LessThan);
    MOV(crReg, 8);
    Done1 = B();
    SetCC(CC_GT);
    ORR(fpscrReg, fpscrReg, GreaterThan);
    MOV(crReg, 4);
    Done2 = B();
    SetCC(CC_EQ);
    ORR(fpscrReg, fpscrReg, EqualTo);
    MOV(crReg, 2);
    Done3 = B();
    SetCC();

    ORR(fpscrReg, fpscrReg, NANRes);
    MOV(crReg, 1);

    VCMPE(vA, vA);
    VMRS(_PC);
    FixupBranch NanA = B_CC(CC_NEQ);
    VCMPE(vB, vB);
    VMRS(_PC);
    FixupBranch NanB = B_CC(CC_NEQ);
    SetFPException(fpscrReg, FPSCR_VXVC);

    FixupBranch Done4 = B();
    SetJumpTarget(NanA);
    SetJumpTarget(NanB);
    SetFPException(fpscrReg, FPSCR_VXSNAN);
    TST(fpscrReg, VEMask);
    FixupBranch noVXVC = B_CC(CC_NEQ);
    SetFPException(fpscrReg, FPSCR_VXVC);
    SetJumpTarget(noVXVC);

    SetJumpTarget(Done1);
    SetJumpTarget(Done2);
    SetJumpTarget(Done3);
    SetJumpTarget(Done4);
    STRB(crReg, R9, PPCSTATE_OFF(cr_fast) + cr);
    STR(fpscrReg, R9, PPCSTATE_OFF(fpscr));
    gpr.Unlock(fpscrReg, crReg);
}
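// Reading aid (PowerPC facts, not new behavior): the crReg values written
// above are the four bits of a CR field -- 8 = LT, 4 = GT, 2 = EQ, 1 = SO --
// which is why the unordered fall-through path stores 1, and the
// 0x8000/0x4000/0x2000/0x1000 constants mirror the same four outcomes
// (FL/FG/FE/FU) in FPSCR's FPRF field.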
int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
{
    int64_t duration = ggl_system_time();

    mBlendFactorCached = 0;
    mBlending = 0;
    mMasking = 0;
    mAA        = GGL_READ_NEEDS(P_AA, needs.p);
    mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
    mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
    mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
    mFog       = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
    mSmooth    = GGL_READ_NEEDS(SHADE, needs.n) != 0;
    mBuilderContext.needs = needs;
    mBuilderContext.c = c;
    mBuilderContext.Rctx = reserveReg(R0); // context always in R0
    mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];

    // ------------------------------------------------------------------------

    decodeLogicOpNeeds(needs);

    decodeTMUNeeds(needs, c);

    mBlendSrc  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
    mBlendDst  = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
    mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
    mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));

    if (!mCbFormat.c[GGLFormat::ALPHA].h) {
        if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrc == GGL_DST_ALPHA)) {
            mBlendSrc = GGL_ONE;
        }
        if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendSrcA == GGL_DST_ALPHA)) {
            mBlendSrcA = GGL_ONE;
        }
        if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDst == GGL_DST_ALPHA)) {
            mBlendDst = GGL_ONE;
        }
        if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
            (mBlendDstA == GGL_DST_ALPHA)) {
            mBlendDstA = GGL_ONE;
        }
    }

    // if we need the framebuffer, read it now
    const int blending = blending_codes(mBlendSrc, mBlendDst) |
                         blending_codes(mBlendSrcA, mBlendDstA);

    // XXX: handle special cases, destination not modified...
    if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
        // Destination unmodified (beware of logic ops)
    } else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
        (mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
        // Destination is zero (beware of logic ops)
    }

    int fbComponents = 0;
    const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
    for (int i=0 ; i<4 ; i++) {
        const int mask = 1<<i;
        component_info_t& info = mInfo[i];
        int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
        int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
        if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
            fs = GGL_ONE;
        info.masked   = !!(masking & mask);
        info.inDest   = !info.masked && mCbFormat.c[i].h &&
                        ((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
        if (mCbFormat.components >= GGL_LUMINANCE &&
                (i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
            info.inDest = false;
        }
        info.needed   = (i==GGLFormat::ALPHA) &&
                        (isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
        info.replaced = !!(mTextureMachine.replaced & mask);
        info.iterated = (!info.replaced && (info.inDest || info.needed));
        info.smooth   = mSmooth && info.iterated;
        info.fog      = mFog && info.inDest && (i != GGLFormat::ALPHA);
        info.blend    = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));

        mBlending |= (info.blend ? mask : 0);
        mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
        fbComponents |= mCbFormat.c[i].h ? mask : 0;
    }

    mAllMasked = (mMasking == fbComponents);
    if (mAllMasked) {
        mDithering = 0;
    }

    fragment_parts_t parts;

    // ------------------------------------------------------------------------
    prolog();
    // ------------------------------------------------------------------------

    build_scanline_prolog(parts, needs);

    if (registerFile().status())
        return registerFile().status();

    // ------------------------------------------------------------------------
    label("fragment_loop");
    // ------------------------------------------------------------------------
    {
        Scratch regs(registerFile());

        if (mDithering) {
            // update the dither index.
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
            ADD(AL, 0, parts.count.reg, parts.count.reg,
                    imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
            MOV(AL, 0, parts.count.reg,
                    reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
        }

        // XXX: could we do an early alpha-test here in some cases?
        // It would probably be used only with smooth-alpha and no texture
        // (or no alpha component in the texture).

        // Early z-test
        if (mAlphaTest==GGL_ALWAYS) {
            build_depth_test(parts, Z_TEST|Z_WRITE);
        } else {
            // we cannot do the z-write here, because
            // it might be killed by the alpha-test later
            build_depth_test(parts, Z_TEST);
        }

        { // texture coordinates
            Scratch scratches(registerFile());
            // texel generation
            build_textures(parts, regs);
        }

        if ((blending & (FACTOR_DST|BLEND_DST)) ||
                (mMasking && !mAllMasked) ||
                (mLogicOp & LOGIC_OP_DST))
        {
            // blending / logic_op / masking need the framebuffer
            mDstPixel.setTo(regs.obtain(), &mCbFormat);

            // load the framebuffer pixel
            comment("fetch color-buffer");
            load(parts.cbPtr, mDstPixel);
        }

        if (registerFile().status())
            return registerFile().status();

        pixel_t pixel;
        int directTex = mTextureMachine.directTexture;
        if (directTex | parts.packed) {
            // note: we can't have both here
            // iterated color or direct texture
            pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
            pixel.flags &= ~CORRUPTIBLE;
        } else {
            if (mDithering) {
                const int ctxtReg = mBuilderContext.Rctx;
                const int mask = GGL_DITHER_SIZE-1;
                parts.dither = reg_t(regs.obtain());
                AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
                ADD(AL, 0, parts.dither.reg, parts.dither.reg, ctxtReg);
                LDRB(AL, parts.dither.reg, parts.dither.reg,
                        immed12_pre(GGL_OFFSETOF(ditherMatrix)));
            }

            // allocate a register for the resulting pixel
            pixel.setTo(regs.obtain(), &mCbFormat, FIRST);

            build_component(pixel, parts, GGLFormat::ALPHA, regs);

            if (mAlphaTest!=GGL_ALWAYS) {
                // only handle the z-write part here. We know z-test
                // was successful, as well as alpha-test.
                build_depth_test(parts, Z_WRITE);
            }

            build_component(pixel, parts, GGLFormat::RED, regs);
            build_component(pixel, parts, GGLFormat::GREEN, regs);
            build_component(pixel, parts, GGLFormat::BLUE, regs);

            pixel.flags |= CORRUPTIBLE;
        }

        if (registerFile().status())
            return registerFile().status();

        if (pixel.reg == -1) {
            // be defensive here. if we're here it's probably
            // that this whole fragment is a no-op.
            pixel = mDstPixel;
        }

        if (!mAllMasked) {
            // logic operation
            build_logic_op(pixel, regs);

            // masking
            build_masking(pixel, regs);

            comment("store");
            store(parts.cbPtr, pixel, WRITE_BACK);
        }
    }

    if (registerFile().status())
        return registerFile().status();

    // update the iterated color...
    if (parts.reload != 3) {
        build_smooth_shade(parts);
    }

    // update iterated z
    build_iterate_z(parts);

    // update iterated fog
    build_iterate_f(parts);

    SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
    B(PL, "fragment_loop");
    label("epilog");
    epilog(registerFile().touched());

    if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
        if (mDepthTest!=GGL_ALWAYS) {
            label("discard_before_textures");
            build_iterate_texture_coordinates(parts);
        }
        label("discard_after_textures");
        build_smooth_shade(parts);
        build_iterate_z(parts);
        build_iterate_f(parts);
        if (!mAllMasked) {
            ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg,
                    imm(parts.cbPtr.size>>3));
        }
        SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
        B(PL, "fragment_loop");
        epilog(registerFile().touched());
    }

    return registerFile().status();
}
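// Reading aid for the loop bookkeeping above (no new behavior): the pixel
// count lives in the high half of parts.count.reg -- hence the SUB of
// imm(1<<16) and the PL branch -- while the dither index occupies the low
// bits. The ROR/ADD/ROR sequence in the dithering block rotates those low
// bits up to bit 31, increments them there so any carry falls off the top
// instead of spilling into the count, then rotates them back down.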
   in order to stop gcc from complaining.  */
#define EMPTY 0,0,NULL

struct ia64_opcode ia64_opcodes_i[] =
  {
    /* I-type instruction encodings (sorted according to major opcode).  */

    {"break.i", I0, OpX3X6 (0, 0, 0x00), {IMMU21}, X_IN_MLX, 0, NULL},
    {"nop.i",   I0, OpX3X6Yb (0, 0, 0x01, 0), {IMMU21}, X_IN_MLX, 0, NULL},
    {"hint.i",  I0, OpX3X6Yb (0, 0, 0x01, 1), {IMMU21}, X_IN_MLX, 0, NULL},
    {"chk.s.i", I0, OpX3 (0, 1), {R2, TGT25b}, EMPTY},

    {"mov", I, OpX3XbIhWhTag13 (0, 7, 0, 0, 1, 0), {B1, R2}, PSEUDO, 0, NULL},
#define MOV(a,b,c,d) \
        I, OpX3XbIhWh (0, a, b, c, d), {B1, R2, TAG13b}, EMPTY
    {"mov.sptk",         MOV (7, 0, 0, 0)},
    {"mov.sptk.imp",     MOV (7, 0, 1, 0)},
    {"mov",              MOV (7, 0, 0, 1)},
    {"mov.imp",          MOV (7, 0, 1, 1)},
    {"mov.dptk",         MOV (7, 0, 0, 2)},
    {"mov.dptk.imp",     MOV (7, 0, 1, 2)},
    {"mov.ret.sptk",     MOV (7, 1, 0, 0)},
    {"mov.ret.sptk.imp", MOV (7, 1, 1, 0)},
    {"mov.ret",          MOV (7, 1, 0, 1)},
    {"mov.ret.imp",      MOV (7, 1, 1, 1)},
    {"mov.ret.dptk",     MOV (7, 1, 0, 2)},
    {"mov.ret.dptk.imp", MOV (7, 1, 1, 2)},
#undef MOV
    {"mov", I, OpX3X6 (0, 0, 0x31), {R1, B2}, EMPTY},
    {"mov", I, OpX3 (0, 3), {PR, R2, IMM17}, EMPTY},
    /* Don't remove one of the seemingly redundant FULL17-s.  */
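/* Reading aid (mechanical expansion, no new entries): with the MOV macro in
 * scope, a row such as
 *
 *     {"mov.sptk", MOV (7, 0, 0, 0)},
 *
 * expands to
 *
 *     {"mov.sptk", I, OpX3XbIhWh (0, 7, 0, 0, 0), {B1, R2, TAG13b}, 0,0,NULL},
 *
 * i.e. the macro only abbreviates the shared opcode/operand boilerplate for
 * the twelve mov-to-branch-register hint variants.  */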
void
vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
   switch (instr->intrinsic) {
   case nir_intrinsic_load_invocation_id:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
               invocation_id));
      break;
   case nir_intrinsic_load_primitive_id:
      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
      break;
   case nir_intrinsic_load_patch_vertices_in:
      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
               brw_imm_d(key->input_vertices)));
      break;
   case nir_intrinsic_load_per_vertex_input: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
      src_reg vertex_index =
         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      emit_input_urb_read(dst, vertex_index, imm_offset,
                          nir_intrinsic_component(instr), indirect_offset);
      break;
   }
   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
      break;
   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output: {
      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
      dst.writemask = brw_writemask_for_size(instr->num_components);

      if (imm_offset == 0 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;

         /* This is a read of gl_TessLevelInner[], which lives in the
          * Patch URB header. The layout depends on the domain.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS: {
            /* DWords 3-2 (reversed); use offset 0 and WZYX swizzle. */
            dst_reg tmp(this, glsl_type::vec4_type);
            emit_output_urb_read(tmp, 0, 0, src_reg());
            emit(MOV(writemask(dst, WRITEMASK_XY),
                     swizzle(src_reg(tmp), BRW_SWIZZLE_WZYX)));
            break;
         }
         case GL_TRIANGLES:
            /* DWord 4; use offset 1 but normal swizzle/writemask. */
            emit_output_urb_read(writemask(dst, WRITEMASK_X), 1, 0,
                                 src_reg());
            break;
         case GL_ISOLINES:
            /* All channels are undefined. */
            return;
         default:
            unreachable("Bogus tessellation domain");
         }
      } else if (imm_offset == 1 && indirect_offset.file == BAD_FILE) {
         dst.type = BRW_REGISTER_TYPE_F;
         unsigned swiz = BRW_SWIZZLE_WZYX;

         /* This is a read of gl_TessLevelOuter[], which lives in the
          * high 4 DWords of the Patch URB header, in reverse order.
          */
         switch (key->tes_primitive_mode) {
         case GL_QUADS:
            dst.writemask = WRITEMASK_XYZW;
            break;
         case GL_TRIANGLES:
            dst.writemask = WRITEMASK_XYZ;
            break;
         case GL_ISOLINES:
            /* Isolines are not reversed; swizzle .zw -> .xy */
            swiz = BRW_SWIZZLE_ZWZW;
            dst.writemask = WRITEMASK_XY;
            break;
         default:
            unreachable("Bogus tessellation domain");
         }

         dst_reg tmp(this, glsl_type::vec4_type);
         emit_output_urb_read(tmp, 1, 0, src_reg());
         emit(MOV(dst, swizzle(src_reg(tmp), swiz)));
      } else {
         emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
                              indirect_offset);
      }
      break;
   }
   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output: {
      src_reg value = get_nir_src(instr->src[0]);
      unsigned mask = instr->const_index[1];
      unsigned swiz = BRW_SWIZZLE_XYZW;

      src_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      /* The passthrough shader writes the whole patch header as two vec4s;
       * skip all the gl_TessLevelInner/Outer swizzling.
       */
      if (indirect_offset.file == BAD_FILE && !is_passthrough_shader) {
         if (imm_offset == 0) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_inner_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelInner[], which lives in the
             * Patch URB header. The layout depends on the domain.
             */
            switch (key->tes_primitive_mode) {
            case GL_QUADS:
               /* gl_TessLevelInner[].xy lives at DWords 3-2 (reversed).
                * We use an XXYX swizzle to put .xy, reversed, into the .wz
                * channels, and use a .zw writemask.
                */
               swiz = BRW_SWIZZLE4(0, 0, 1, 0);
               mask = writemask_for_backwards_vector(mask);
               break;
            case GL_TRIANGLES:
               /* gl_TessLevelInner[].x lives at DWord 4, so we set the
                * writemask to X and bump the URB offset by 1.
                */
               imm_offset = 1;
               break;
            case GL_ISOLINES:
               /* Skip; gl_TessLevelInner[] doesn't exist for isolines. */
               return;
            default:
               unreachable("Bogus tessellation domain");
            }
         } else if (imm_offset == 1) {
            value.type = BRW_REGISTER_TYPE_F;

            mask &=
               (1 << tesslevel_outer_components(key->tes_primitive_mode)) - 1;

            /* This is a write to gl_TessLevelOuter[] which lives in the
             * Patch URB Header at DWords 4-7. However, it's reversed, so
             * instead of .xyzw we have .wzyx.
             */
            if (key->tes_primitive_mode == GL_ISOLINES) {
               /* Isolines .xy should be stored in .zw, in order. */
               swiz = BRW_SWIZZLE4(0, 0, 0, 1);
               mask <<= 2;
            } else {
               /* Other domains are reversed; store .wzyx instead of .xyzw. */
               swiz = BRW_SWIZZLE_WZYX;
               mask = writemask_for_backwards_vector(mask);
            }
         }
      }

      unsigned first_component = nir_intrinsic_component(instr);
      if (first_component) {
         assert(swiz == BRW_SWIZZLE_XYZW);
         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
         mask = mask << first_component;
      }

      emit_urb_write(swizzle(value, swiz), mask, imm_offset, indirect_offset);
      break;
   }
   case nir_intrinsic_barrier: {
      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
      break;
   }
   default:
      vec4_visitor::nir_emit_intrinsic(instr);
   }
}
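/* Worked example of the isoline store path above (illustration only): a
 * write of gl_TessLevelOuter[0..1] arrives with mask 0x3 (.xy). The
 * BRW_SWIZZLE4(0, 0, 0, 1) swizzle replicates .x into .xyz and puts .y in
 * .w, and mask <<= 2 turns 0x3 into 0xC (.zw), so value.x lands in DWord 6
 * and value.y in DWord 7 of the patch header -- the unreversed order the
 * comment describes. */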
void Jit64::lfd(UGeckoInstruction inst)
{
    INSTRUCTION_START
    JITDISABLE(bJITLoadStoreFloatingOff);
    FALLBACK_IF(js.memcheck || !inst.RA);

    int d = inst.RD;
    int a = inst.RA;

    s32 offset = (s32)(s16)inst.SIMM_16;
    gpr.FlushLockX(ABI_PARAM1);
    gpr.Lock(a);
    MOV(32, R(ABI_PARAM1), gpr.R(a));
    // TODO - optimize. This has to load the previous value - upper double should stay unmodified.
    fpr.Lock(d);
    fpr.BindToRegister(d, true);
    X64Reg xd = fpr.RX(d);

    if (cpu_info.bSSSE3)
    {
#if _M_X86_64
        MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
        AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
        MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
        PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
        MOVSD(xd, R(XMM0));
    }
    else
    {
#if _M_X86_64
        LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
        MOV(64, M(&temp64), R(EAX));

        MEMCHECK_START

        MOVSD(XMM0, M(&temp64));
        MOVSD(xd, R(XMM0));

        MEMCHECK_END
#else
        AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
        MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
        BSWAP(32, EAX);
        MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));

        MEMCHECK_START

        MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
        BSWAP(32, EAX);
        MOV(32, M(&temp64), R(EAX));
        MOVSD(XMM0, M(&temp64));
        MOVSD(xd, R(XMM0));

        MEMCHECK_END
#endif
    }

    gpr.UnlockAll();
    gpr.UnlockAllX();
    fpr.UnlockAll();
}
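// Reading aid for the non-SSSE3 32-bit path above (no new behavior): guest
// PowerPC doubles are big-endian, so the word at the lower guest address is
// the most-significant half. Each 32-bit half is byte-swapped with BSWAP,
// and the first word is stored to (u8*)&temp64 + 4 -- the high half on a
// little-endian host -- so that temp64 reassembles the IEEE double in host
// order before the MOVSD into the target XMM register.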
void Jit::Comp_mxc1(MIPSOpcode op)
{
    CONDITIONAL_DISABLE;

    int fs = _FS;
    MIPSGPReg rt = _RT;

    switch ((op >> 21) & 0x1f)
    {
    case 0: // R(rt) = FI(fs); break; //mfc1
        if (rt == MIPS_REG_ZERO)
            return;
        gpr.MapReg(rt, false, true);
        // If fs is not mapped, most likely it's being abandoned.
        // Just load from memory in that case.
        if (fpr.R(fs).IsSimpleReg()) {
            MOVD_xmm(gpr.R(rt), fpr.RX(fs));
        } else {
            MOV(32, gpr.R(rt), fpr.R(fs));
        }
        break;

    case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
        if (rt == MIPS_REG_ZERO)
            return;
        if (fs == 31) {
            bool wasImm = gpr.IsImm(MIPS_REG_FPCOND);
            if (!wasImm) {
                gpr.Lock(rt, MIPS_REG_FPCOND);
                gpr.MapReg(MIPS_REG_FPCOND, true, false);
            }
            gpr.MapReg(rt, false, true);
            MOV(32, gpr.R(rt), M(&mips_->fcr31));
            if (wasImm) {
                if (gpr.GetImm(MIPS_REG_FPCOND) & 1) {
                    OR(32, gpr.R(rt), Imm32(1 << 23));
                } else {
                    AND(32, gpr.R(rt), Imm32(~(1 << 23)));
                }
            } else {
                AND(32, gpr.R(rt), Imm32(~(1 << 23)));
                MOV(32, R(TEMPREG), gpr.R(MIPS_REG_FPCOND));
                AND(32, R(TEMPREG), Imm32(1));
                SHL(32, R(TEMPREG), Imm8(23));
                OR(32, gpr.R(rt), R(TEMPREG));
            }
            gpr.UnlockAll();
        } else if (fs == 0) {
            gpr.SetImm(rt, MIPSState::FCR0_VALUE);
        } else {
            Comp_Generic(op);
        }
        return;

    case 4: //FI(fs) = R(rt); break; //mtc1
        fpr.MapReg(fs, false, true);
        if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
            XORPS(fpr.RX(fs), fpr.R(fs));
        } else {
            gpr.KillImmediate(rt, true, false);
            MOVD_xmm(fpr.RX(fs), gpr.R(rt));
        }
        return;

    case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
        if (fs == 31) {
            // Must clear before setting, since ApplyRoundingMode() assumes it was cleared.
            RestoreRoundingMode();
            if (gpr.IsImm(rt)) {
                gpr.SetImm(MIPS_REG_FPCOND, (gpr.GetImm(rt) >> 23) & 1);
                MOV(32, M(&mips_->fcr31), Imm32(gpr.GetImm(rt) & 0x0181FFFF));
                if ((gpr.GetImm(rt) & 0x1000003) == 0) {
                    // Default nearest / no-flush mode, just leave it cleared.
                } else {
                    UpdateRoundingMode();
                    ApplyRoundingMode();
                }
            } else {
void
gen6_gs_visitor::visit(ir_emit_vertex *)
{
   this->current_annotation = "gen6 emit vertex";

   /* Honor the max_vertices layout indication in the geometry shader by
    * ignoring any vertices coming after c->gp->program.VerticesOut.
    */
   unsigned num_output_vertices = c->gp->program.VerticesOut;
   emit(CMP(dst_null_d(), this->vertex_count,
            src_reg(num_output_vertices), BRW_CONDITIONAL_L));
   emit(IF(BRW_PREDICATE_NORMAL));
   {
      /* Buffer all output slots for this vertex in vertex_output */
      for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
         int varying = prog_data->vue_map.slot_to_varying[slot];
         if (varying != VARYING_SLOT_PSIZ) {
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            emit_urb_slot(dst, varying);
         } else {
            /* The PSIZ slot can pack multiple varyings in different channels
             * and emit_urb_slot() will produce a MOV instruction for each of
             * them. Since we are writing to an array, that will translate to
             * possibly multiple MOV instructions with an array destination and
             * each will generate a scratch write with the same offset into
             * scratch space (thus, each one overwriting the previous). This is
             * not what we want. What we will do instead is emit PSIZ to a
             * regular temporary register, then move that register into the
             * array. This way we only have one instruction with an array
             * destination and we only produce a single scratch write.
             */
            dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
            emit_urb_slot(tmp, varying);
            dst_reg dst(this->vertex_output);
            dst.reladdr = ralloc(mem_ctx, src_reg);
            memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
            vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
            inst->force_writemask_all = true;
         }

         emit(ADD(dst_reg(this->vertex_output_offset),
                  this->vertex_output_offset, 1u));
      }

      /* Now buffer flags for this vertex */
      dst_reg dst(this->vertex_output);
      dst.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
      if (c->gp->program.OutputType == GL_POINTS) {
         /* If we are outputting points, then every vertex has PrimStart and
          * PrimEnd set.
          */
         emit(MOV(dst, (_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
                  URB_WRITE_PRIM_START | URB_WRITE_PRIM_END));
         emit(ADD(dst_reg(this->prim_count), this->prim_count, 1u));
      } else {
         /* Otherwise, we can only set the PrimStart flag, which we have stored
          * in the first_vertex register. We will have to wait until we execute
          * EndPrimitive() or we end the thread to set the PrimEnd flag on a
          * vertex.
          */
         emit(OR(dst, this->first_vertex,
                 (c->prog_data.output_topology << URB_WRITE_PRIM_TYPE_SHIFT)));
         emit(MOV(dst_reg(this->first_vertex), 0u));
      }
      emit(ADD(dst_reg(this->vertex_output_offset),
               this->vertex_output_offset, 1u));

      /* Update vertex count */
      emit(ADD(dst_reg(this->vertex_count), this->vertex_count, 1u));
   }
   emit(BRW_OPCODE_ENDIF);
}
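/* Reading aid for the buffering scheme above (no new behavior): each emitted
 * vertex occupies vue_map.num_slots + 1 consecutive entries of the
 * vertex_output array -- one vec4 per output slot, addressed indirectly
 * through vertex_output_offset, followed by a single flags DWord carrying the
 * primitive type plus the PrimStart/PrimEnd markers that the URB writes will
 * need when the buffered vertices are finally flushed. */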