void MIPSEmitter::J(const void *func, std::function<void ()> delaySlot) {
	SetJumpTarget(J(delaySlot), func);
}

void Jit::Comp_FPULS(u32 op) {
	CONDITIONAL_DISABLE;

	s32 offset = (s16)(op & 0xFFFF);
	int ft = _FT;
	int rs = _RS;
	// u32 addr = R(rs) + offset;
	// logBlocks = 1;
	bool doCheck = false;

	switch (op >> 26) {
	case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1
		fpr.MapReg(ft, MAP_NOINIT | MAP_DIRTY);
		if (gpr.IsImm(rs)) {
			u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
			MOVI2R(R0, addr + (u32)Memory::base);
		} else {
			gpr.MapReg(rs);
			if (g_Config.bFastMemory) {
				SetR0ToEffectiveAddress(rs, offset);
			} else {
				SetCCAndR0ForSafeAddress(rs, offset, R1);
				doCheck = true;
			}
			ADD(R0, R0, R11);
		}
#ifdef __ARM_ARCH_7S__
		FixupBranch skip;
		if (doCheck) {
			skip = B_CC(CC_EQ);
		}
		VLDR(fpr.R(ft), R0, 0);
		if (doCheck) {
			SetJumpTarget(skip);
			SetCC(CC_AL);
		}
#else
		VLDR(fpr.R(ft), R0, 0);
		if (doCheck) {
			SetCC(CC_EQ);
			MOVI2R(R0, 0);
			VMOV(fpr.R(ft), R0);
			SetCC(CC_AL);
		}
#endif
		break;

	case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1
		fpr.MapReg(ft);
		if (gpr.IsImm(rs)) {
			u32 addr = (offset + gpr.GetImm(rs)) & 0x3FFFFFFF;
			MOVI2R(R0, addr + (u32)Memory::base);
		} else {
			gpr.MapReg(rs);
			if (g_Config.bFastMemory) {
				SetR0ToEffectiveAddress(rs, offset);
			} else {
				SetCCAndR0ForSafeAddress(rs, offset, R1);
				doCheck = true;
			}
			ADD(R0, R0, R11);
		}
#ifdef __ARM_ARCH_7S__
		FixupBranch skip2;
		if (doCheck) {
			skip2 = B_CC(CC_EQ);
		}
		VSTR(fpr.R(ft), R0, 0);
		if (doCheck) {
			SetJumpTarget(skip2);
			SetCC(CC_AL);
		}
#else
		VSTR(fpr.R(ft), R0, 0);
		if (doCheck) {
			SetCC(CC_AL);
		}
#endif
		break;

	default:
		Comp_Generic(op);
		return;
	}
}

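// Addressing notes (a reading of the code above, not authoritative): the
// 0x3FFFFFFF mask folds the PSP's mirrored address ranges onto one view
// before Memory::base is added, R11 is assumed to hold Memory::base for the
// register-indexed path, and SetCCAndR0ForSafeAddress leaves the condition
// flags set so an out-of-range access can be skipped (ARMv7s path) or its
// result zeroed (other cores).
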
void MIPSEmitter::BGTZ(MIPSReg rs, const void *func, std::function<void ()> delaySlot) {
	SetJumpTarget(BGTZ(rs, delaySlot), func);
}

void MIPSEmitter::SetJumpTarget(const FixupBranch &branch) {
	SetJumpTarget(branch, code_);
}

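// These overloads pair a branch with its delay slot: the delaySlot callback
// emits the instruction that architecturally executes after the branch, and
// the FixupBranch returned by the relative form is then bound to the absolute
// target via SetJumpTarget(branch, func). A call site might look like this
// (hypothetical emitter and register names, for illustration only):
//   emit.BNE(rs, rt, loopTop, [&] { emit.NOP(); });
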
void JitArm::lXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreOff);

	u32 a = inst.RA, b = inst.RB, d = inst.RD;
	s32 offset = inst.SIMM_16;
	u32 accessSize = 0;
	s32 offsetReg = -1;
	bool update = false;
	bool signExtend = false;
	bool reverse = false;
	bool fastmem = false;

	switch (inst.OPCD)
	{
	case 31:
		switch (inst.SUBOP10)
		{
		case 55: // lwzux
			update = true;
		case 23: // lwzx
			fastmem = true;
			accessSize = 32;
			offsetReg = b;
			break;
		case 119: //lbzux
			update = true;
		case 87: // lbzx
			fastmem = true;
			accessSize = 8;
			offsetReg = b;
			break;
		case 311: // lhzux
			update = true;
		case 279: // lhzx
			fastmem = true;
			accessSize = 16;
			offsetReg = b;
			break;
		case 375: // lhaux
			update = true;
		case 343: // lhax
			accessSize = 16;
			signExtend = true;
			offsetReg = b;
			break;
		case 534: // lwbrx
			accessSize = 32;
			reverse = true;
			break;
		case 790: // lhbrx
			accessSize = 16;
			reverse = true;
			break;
		}
		break;
	case 33: // lwzu
		update = true;
	case 32: // lwz
		fastmem = true;
		accessSize = 32;
		break;
	case 35: // lbzu
		update = true;
	case 34: // lbz
		fastmem = true;
		accessSize = 8;
		break;
	case 41: // lhzu
		update = true;
	case 40: // lhz
		fastmem = true;
		accessSize = 16;
		break;
	case 43: // lhau
		update = true;
	case 42: // lha
		signExtend = true;
		accessSize = 16;
		break;
	}

	// Check for exception before loading
	ARMReg rA = gpr.GetReg(false);

	LDR(rA, R9, PPCSTATE_OFF(Exceptions));
	TST(rA, EXCEPTION_DSI);
	FixupBranch DoNotLoad = B_CC(CC_NEQ);

	SafeLoadToReg(fastmem, d, update ? a : (a ? a : -1), offsetReg, accessSize, offset, signExtend, reverse);

	if (update)
	{
		ARMReg RA = gpr.R(a);
		if (offsetReg == -1)
		{
			rA = gpr.GetReg(false);
			MOVI2R(rA, offset);
			ADD(RA, RA, rA);
		}
		else
		{
			ADD(RA, RA, gpr.R(offsetReg));
		}
	}

	SetJumpTarget(DoNotLoad);

	// LWZ idle skipping
	if (SConfig::GetInstance().m_LocalCoreStartupParameter.bSkipIdle &&
	    inst.OPCD == 32 &&
	    (inst.hex & 0xFFFF0000) == 0x800D0000 &&
	    (Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x28000000 ||
	     (SConfig::GetInstance().m_LocalCoreStartupParameter.bWii && Memory::ReadUnchecked_U32(js.compilerPC + 4) == 0x2C000000)) &&
	    Memory::ReadUnchecked_U32(js.compilerPC + 8) == 0x4182fff8)
	{
		ARMReg RD = gpr.R(d);

		// if it's still 0, we can wait until the next event
		TST(RD, RD);
		FixupBranch noIdle = B_CC(CC_NEQ);

		gpr.Flush(FLUSH_MAINTAIN_STATE);
		fpr.Flush(FLUSH_MAINTAIN_STATE);

		rA = gpr.GetReg();

		MOVI2R(rA, (u32)&PowerPC::OnIdle);
		MOVI2R(R0, PowerPC::ppcState.gpr[a] + (s32)(s16)inst.SIMM_16);
		BL(rA);

		gpr.Unlock(rA);
		WriteExceptionExit();

		SetJumpTarget(noIdle);
		//js.compilerPC += 8;
		return;
	}
}

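// The idle-skip check above matches a common PowerPC spin loop, decoding the
// magic constants (an interpretation of the masks, not from the source):
//   0x800D0000 mask -> lwz    r0, imm(r13)   (load from the small-data area)
//   0x28000000      -> cmplwi cr0, r0, 0     (0x2C000000 = cmpwi, seen on Wii)
//   0x4182fff8      -> beq    -8             (branch back to the lwz)
// If the loaded value is still zero, the block calls PowerPC::OnIdle so the
// core can fast-forward to the next event instead of spinning.
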
void JitArm::ps_cmpo1(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);

	u32 a = inst.FA, b = inst.FB;
	int cr = inst.CRFD;

	ARMReg vA = fpr.R1(a);
	ARMReg vB = fpr.R1(b);
	ARMReg fpscrReg = gpr.GetReg();
	ARMReg crReg = gpr.GetReg();
	Operand2 FPRFMask(0x1F, 0xA); // 0x1F000
	Operand2 LessThan(0x8, 0xA); // 0x8000
	Operand2 GreaterThan(0x4, 0xA); // 0x4000
	Operand2 EqualTo(0x2, 0xA); // 0x2000
	Operand2 NANRes(0x1, 0xA); // 0x1000
	FixupBranch Done1, Done2, Done3;

	LDR(fpscrReg, R9, PPCSTATE_OFF(fpscr));
	BIC(fpscrReg, fpscrReg, FPRFMask);

	VCMPE(vA, vB);
	VMRS(_PC);
	SetCC(CC_LT);
	ORR(fpscrReg, fpscrReg, LessThan);
	MOV(crReg, 8);
	Done1 = B();
	SetCC(CC_GT);
	ORR(fpscrReg, fpscrReg, GreaterThan);
	MOV(crReg, 4);
	Done2 = B();
	SetCC(CC_EQ);
	ORR(fpscrReg, fpscrReg, EqualTo);
	MOV(crReg, 2);
	Done3 = B();
	SetCC();

	ORR(fpscrReg, fpscrReg, NANRes);
	MOV(crReg, 1);

	VCMPE(vA, vA);
	VMRS(_PC);
	FixupBranch NanA = B_CC(CC_NEQ);
	VCMPE(vB, vB);
	VMRS(_PC);
	FixupBranch NanB = B_CC(CC_NEQ);

	SetFPException(fpscrReg, FPSCR_VXVC);
	FixupBranch Done4 = B();

	SetJumpTarget(NanA);
	SetJumpTarget(NanB);
	SetFPException(fpscrReg, FPSCR_VXSNAN);

	TST(fpscrReg, VEMask);
	FixupBranch noVXVC = B_CC(CC_NEQ);
	SetFPException(fpscrReg, FPSCR_VXVC);

	SetJumpTarget(noVXVC);
	SetJumpTarget(Done1);
	SetJumpTarget(Done2);
	SetJumpTarget(Done3);
	SetJumpTarget(Done4);
	STRB(crReg, R9, PPCSTATE_OFF(cr_fast) + cr);
	STR(fpscrReg, R9, PPCSTATE_OFF(fpscr));
	gpr.Unlock(fpscrReg, crReg);
}

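// The Operand2(imm, rot) pairs above are ARM modified immediates: an 8-bit
// value rotated right by 2*rot. For example, Operand2(0x1F, 0xA) is
// 0x1F ror 20 == 0x1F000, the FPSCR FPRF field; 0x8/0x4/0x2/0x1 with the same
// rotation give the FPRF encodings for less-than, greater-than, equal, and
// unordered (0x8000, 0x4000, 0x2000, 0x1000). VEMask is assumed to be a
// file-scope constant masking the FPSCR VE (invalid-operation enable) bit.
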
// Produce optimized form of timesRepeat: [...].
// Returns true if optimization performed.
// Note that we perform the conditional jump as a backwards jump at the end for optimal performance
bool Compiler::ParseTimesRepeatLoop(const TEXTRANGE& messageRange)
{
	POTE oteSelector = AddSymbolToFrame("timesRepeat:", messageRange);
	if (!ThisTokenIsBinary('['))
	{
		Warning(messageRange, CWarnExpectNiladicBlockArg, (Oop)oteSelector);
		return false;
	}

	// Can apply extra optimizations if receiver is known SmallInteger
	int loopTimes = 0;
	bool isIntReceiver = LastIsPushSmallInteger(loopTimes);

	int startMark = GenDup();
	int jumpOver = GenJumpInstruction(LongJump);

	int loopHead = m_codePointer;

	PushOptimizedScope();

	// Parse the loop block and ignore its result
	ParseOptimizeBlock(0);
	GenPopStack();

	PopOptimizedScope(ThisTokenRange().m_stop);
	NextToken();

	if (isIntReceiver && loopTimes <= 0)
	{
		// Blank out all our bytecodes if we have zero or fewer loops
		const int loopEnd = m_codePointer;
		for (int p = startMark; p < loopEnd; p++)
			UngenInstruction(p);
		return true;
	}

	// Decrement counter
	GenInstruction(DecrementStackTop);
	// Dup counter for compare
	int testMark = GenDup();

	// Fill in forward unconditional jump before the head of the loop
	// which jumps to the conditional test
	SetJumpTarget(jumpOver, testMark);

	if (isIntReceiver)
	{
		// Using IsZero speeds up empty loop by about 5%.
		_ASSERTE(loopTimes > 0);
		GenInstruction(SpecialSendIsZero);
	}
	else
	{
		// Test for another run around the wheel - favour use of #<, therefore need to test against 1 rather than 0
		GenInteger(1, TEXTRANGE()); // No breakpoint wanted here
		//GenMessage("<", 1);
		GenInstruction(SendArithmeticLT);
	}

	// Conditional jump back to loop head
	GenJump(LongJumpIfFalse, loopHead);

	// Pop off the loop counter, leaving the integer receiver on the stack
	GenPopStack();

	return true;
}

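// Shape of the bytecode this emits, reconstructed from the calls above:
//   dup                    ; receiver copy becomes the loop counter
//   jump test              ; enter at the test, so n = 0 runs the body zero times
// loopHead:
//   <block body>
//   pop                    ; discard the block's result
//   decrement              ; counter := counter - 1
// test:
//   dup
//   isZero                 ; known-SmallInteger receiver; otherwise: push 1; send #<
//   jumpIfFalse loopHead   ; the single conditional jump, taken backwards
//   pop                    ; drop the counter, leaving the receiver as the value
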
void JitArm64::twx(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITSystemRegistersOff);

	s32 a = inst.RA;

	ARM64Reg WA = gpr.GetReg();

	if (inst.OPCD == 3) // twi
	{
		CMPI2R(gpr.R(a), (s32)(s16)inst.SIMM_16, WA);
	}
	else // tw
	{
		CMP(gpr.R(a), gpr.R(inst.RB));
	}

	std::vector<FixupBranch> fixups;
	CCFlags conditions[] = { CC_LT, CC_GT, CC_EQ, CC_VC, CC_VS };

	for (int i = 0; i < 5; i++)
	{
		if (inst.TO & (1 << i))
		{
			FixupBranch f = B(conditions[i]);
			fixups.push_back(f);
		}
	}
	FixupBranch dont_trap = B();

	for (const FixupBranch& fixup : fixups)
	{
		SetJumpTarget(fixup);
	}

	FixupBranch far = B();
	SwitchToFarCode();
	SetJumpTarget(far);

	gpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);
	fpr.Flush(FlushMode::FLUSH_MAINTAIN_STATE);

	LDR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(Exceptions));
	ORR(WA, WA, 24, 0); // Same as WA | EXCEPTION_PROGRAM
	STR(INDEX_UNSIGNED, WA, PPC_REG, PPCSTATE_OFF(Exceptions));

	gpr.Unlock(WA);

	WriteExceptionExit(js.compilerPC);

	SwitchToNearCode();

	SetJumpTarget(dont_trap);

	if (!analyzer.HasOption(PPCAnalyst::PPCAnalyzer::OPTION_CONDITIONAL_CONTINUE))
	{
		gpr.Flush(FlushMode::FLUSH_ALL);
		fpr.Flush(FlushMode::FLUSH_ALL);
		WriteExit(js.compilerPC + 4);
	}
}

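// Design note, a reading of the code above: the trap itself is rare, so its
// handling is emitted via SwitchToFarCode()/SwitchToNearCode(). The near path
// is just the compare, up to five conditional branches (one per TO bit), and
// the fall-through; flushing registers and raising EXCEPTION_PROGRAM live in
// the far pool where they don't pollute the hot instruction stream.
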
bool Compiler::ParseIfNotNil(const TEXTRANGE& messageRange, int exprStartPos)
{
	if (!ThisTokenIsBinary('['))
	{
		Warning(CWarnExpectMonadicOrNiladicBlockArg, (Oop)InternSymbol("ifNotNil:"));
		return false;
	}

	int dupMark = GenDup();
	BreakPoint();
	const int sendIsNil = GenInstruction(SpecialSendIsNil);
	AddTextMap(sendIsNil, exprStartPos, LastTokenRange().m_stop);

	// We're going to add a pop and jump on condition instruction here.
	// It's a forward jump, so we need to patch up the target later
	const int popAndJumpMark = GenJumpInstruction(LongJumpIfTrue);

	int argc = ParseIfNotNilBlock();

	// If the ifNotNil: block did not have any arguments, then we do not need the Dup, and we also
	// need to patch out the corresponding pop.
	if (!argc)
	{
		UngenInstruction(dupMark);
		UngenInstruction(popAndJumpMark + lengthOfByteCode(LongJumpIfTrue));
	}

	int ifNilMark;
	// Has an #ifNil: branch?
	if (strcmp(ThisTokenText(), "ifNil:") == 0)
	{
		POTE oteSelector = AddSymbolToFrame("ifNotNil:ifNil:", messageRange);

		// Generate the jump out instruction (forward jump, so target not yet known)
		int jumpOutMark = GenJumpInstruction(LongJump);

		// Mark first instruction of the "else" branch
		ifNilMark = m_codePointer;

		// ifNotNil:ifNil: form
		NextToken();
		ParseIfNilBlock(!argc);

		SetJumpTarget(jumpOutMark, GenNop());
	}
	else
	{
		// No "ifNil:" branch
		POTE oteSelector = AddSymbolToFrame("ifNotNil:", messageRange);

		if (!argc)
		{
			// Since we've removed the Dup if the ifNotNil: block had no args, we need to ensure there is nil atop the stack.
			// This should normally get optimized away later if the expression value is not used.

			// Generate the jump out instruction
			int jumpOutMark = GenJumpInstruction(LongJump);
			ifNilMark = GenInstruction(ShortPushNil);
			SetJumpTarget(jumpOutMark, GenNop());
		}
		else
		{
			ifNilMark = GenNop();
		}
	}

	SetJumpTarget(popAndJumpMark, ifNilMark);
	return true;
}

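// Dup elision (a reading of the code above): the leading Dup exists only to
// feed the expression value to a one-argument ifNotNil: block. When the block
// declares no argument, the Dup and the PopStackTop that immediately follows
// the conditional jump are both ungenerated, so the isNil test consumes the
// expression value directly.
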
void Compiler::ParseToByNumberDo(int toPointer, Oop oopNumber, bool bNegativeStep)
{
	PushOptimizedScope();

	// Add an "each". This should be at the top of the temporary stack as it is taken over by the do: block
	TempVarRef* pEachTempRef = AddOptimizedTemp(eachTempName);

	// Start to generate bytecodes

	// First we must store that from value into our 'each' counter variable/argument. This involves
	// stepping back a bit ...
	_ASSERTE(toPointer < m_codePointer);
	int currentPos = m_codePointer;
	m_codePointer = toPointer;
	// Note we store so leaving the value on the stack as the result of the whole expression
	GenStoreTemp(pEachTempRef);
	m_codePointer = currentPos + m_bytecodes[toPointer].instructionLength();

	// Dup the to: value
	GenDup();
	// And push the from value
	GenPushTemp(pEachTempRef);

	// We must jump over the block to the test first time through
	int jumpOver = GenJumpInstruction(LongJump);

	int loopHead = m_codePointer;

	// Parse the one argument block.
	// Leave nothing on the stack, expect 1 argument
	ParseOptimizeBlock(1);

	// Pop off the result of the optimized block
	GenPopStack();

	GenDup();
	GenPushTemp(pEachTempRef);
	// If the step is 1/-1, this will be optimised down to Increment/Decrement
	GenNumber(oopNumber, LastTokenRange());
	int add = GenInstruction(SendArithmeticAdd);
	int store = GenStoreTemp(pEachTempRef);

	int comparePointer = m_codePointer;
	if (bNegativeStep)
		GenInstruction(SendArithmeticGT);
	else
		// Note that < is the best message to send, since it is normally directly implemented
		GenInstruction(SendArithmeticLT);

	GenJump(LongJumpIfFalse, loopHead);

	// Pop the to: value
	GenPopStack();

	SetJumpTarget(jumpOver, comparePointer);

	TODO("Is this in the right place? What is the last real IP of the loop");
	PopOptimizedScope(ThisTokenRange().m_stop);
	NextToken();
}

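// Back-patching notes, reconstructed from the calls above: the store of the
// from: value into 'each' is spliced in *before* the already-emitted to:
// expression by temporarily rewinding m_codePointer. The first-time jump
// lands at comparePointer, so the #< test (or #> for a negative by: step)
// runs before the block body ever executes - an empty range iterates zero
// times - while each later iteration re-adds the step and stores it back
// before falling into the same test.
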
bool Compiler::ParseIfNil(const TEXTRANGE& messageRange, int exprStartPos)
{
	if (!ThisTokenIsBinary('['))
	{
		Warning(CWarnExpectNiladicBlockArg, (Oop)InternSymbol("ifNil:"));
		return false;
	}

	int dupMark = GenDup();
	BreakPoint();
	const int sendIsNil = GenInstruction(SpecialSendIsNil);
	const int mapEntry = AddTextMap(sendIsNil, exprStartPos, LastTokenRange().m_stop);

	// We're going to add a pop and jump on condition instruction here.
	// It's a forward jump, so we need to patch up the target later
	const int popAndJumpMark = GenJumpInstruction(LongJumpIfFalse);

	ParseIfNilBlock(false);

	int ifNotNilMark;
	if (strcmp(ThisTokenText(), "ifNotNil:") == 0)
	{
		POTE oteSelector = AddSymbolToFrame("ifNil:ifNotNil:", messageRange);

		// Generate the jump out instruction (forward jump, so target not yet known)
		int jumpOutMark = GenJumpInstruction(LongJump);

		// Mark first instruction of the "else" branch
		ifNotNilMark = m_codePointer;

		// #ifNil:ifNotNil: form
		NextToken();
		int argc = ParseIfNotNilBlock();

		// Be careful, may not actually be a literal block there
		if (m_ok)
		{
			// If the ifNotNil: block does not need an argument, we can patch out the Dup
			// and corresponding pop
			if (!argc)
			{
				UngenInstruction(dupMark);
				_ASSERTE(m_bytecodes[popAndJumpMark + lengthOfByteCode(LongJumpIfFalse)].byte == PopStackTop);
				UngenInstruction(popAndJumpMark + lengthOfByteCode(LongJumpIfFalse));
				_ASSERTE(m_bytecodes[ifNotNilMark].byte == PopStackTop);
				UngenInstruction(ifNotNilMark);
			}

			// Set unconditional jump at the end of the ifNil to jump over the "else" branch to a Nop
			SetJumpTarget(jumpOutMark, GenNop());
		}
	}
	else
	{
		POTE oteSelector = AddSymbolToFrame("ifNil:", messageRange);

		// No "else" branch, but we still need an instruction to jump to
		ifNotNilMark = GenNop();
	}

	if (m_ok)
	{
		// Conditional jump to the "else" branch (or the Nop if no else branch)
		SetJumpTarget(popAndJumpMark, ifNotNilMark);

		if (mapEntry >= 0)
			m_textMaps[mapEntry].stop = LastTokenRange().m_stop;
	}

	return true;
}

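// The GenNop() calls in both forms above matter: a forward LongJump needs a
// concrete instruction to land on, so when a branch ends at the method's edge
// a Nop is emitted purely to serve as the jump target. Note also that the
// text-map stop is widened at the end so the isNil send maps to the whole
// ifNil:ifNotNil: expression in the debugger.
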
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();
	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}
	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM. Separate CLUTs is uncommon.
		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}
	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);
	RET();

	EndWrite();
	return (LinearFunc)start;
}

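// The blend above computes, per channel (a reading of the emitted code):
//   top    = TL*(256-frac_u) + TR*frac_u
//   bottom = BL*(256-frac_u) + BR*frac_u
//   result = (top*(256-frac_v) + bottom*frac_v) / 65536
// with by256 holding 1/256, so each weighting pass also divides by 256 and
// the intermediates stay at the original 0..255 scale before being packed
// back down to 8-bit with PACKSSDW/PACKUSWB.
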
void Jit64::stfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU ||
	    Core::g_CoreStartupParameter.bTLBHack)
	{
		mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging)
	{
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3)
	{
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	}
	else
	{
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);
	SetJumpTarget(safe);

	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}

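// Path selection note: the TEST against mem_mask picks the route - an address
// with no masked bits set is plain RAM and is written inline (a single
// 8-byte MOVQ after an SSSE3 PSHUFB byte swap when available), while anything
// else falls through to the SafeWriteRegToReg calls that can take the
// MMU/debug slow path. The non-SSSE3 paths store the double as two
// byte-swapped 32-bit halves, hence the paired writes.
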
void JitArm::lfXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff)

	ARMReg rA = gpr.GetReg();
	ARMReg rB = gpr.GetReg();
	ARMReg RA;

	u32 a = inst.RA, b = inst.RB;

	s32 offset = inst.SIMM_16;
	bool single = false;
	bool update = false;
	bool zeroA = false;
	s32 offsetReg = -1;

	switch (inst.OPCD)
	{
	case 31:
		switch (inst.SUBOP10)
		{
		case 567: // lfsux
			single = true;
			update = true;
			offsetReg = b;
			break;
		case 535: // lfsx
			single = true;
			zeroA = true;
			offsetReg = b;
			break;
		case 631: // lfdux
			update = true;
			offsetReg = b;
			break;
		case 599: // lfdx
			zeroA = true;
			offsetReg = b;
			break;
		}
		break;
	case 49: // lfsu
		update = true;
		single = true;
		break;
	case 48: // lfs
		single = true;
		zeroA = true;
		break;
	case 51: // lfdu
		update = true;
		break;
	case 50: // lfd
		zeroA = true;
		break;
	}

	ARMReg v0 = fpr.R0(inst.FD), v1;
	if (single)
		v1 = fpr.R1(inst.FD);

	if (update)
	{
		RA = gpr.R(a); // Update path /always/ uses RA
		if (offsetReg == -1) // uses SIMM_16
		{
			MOVI2R(rB, offset);
			ADD(rB, rB, RA);
		}
		else
			ADD(rB, gpr.R(offsetReg), RA);
	}
	else
	{
		if (zeroA)
		{
			if (offsetReg == -1)
			{
				if (a)
				{
					RA = gpr.R(a);
					MOVI2R(rB, offset);
					ADD(rB, rB, RA);
				}
				else
					MOVI2R(rB, (u32)offset);
			}
			else
			{
				ARMReg RB = gpr.R(offsetReg);
				if (a)
				{
					RA = gpr.R(a);
					ADD(rB, RB, RA);
				}
				else
					MOV(rB, RB);
			}
		}
	}

	LDR(rA, R9, PPCSTATE_OFF(Exceptions));
	CMP(rA, EXCEPTION_DSI);
	FixupBranch DoNotLoad = B_CC(CC_EQ);

	if (update)
		MOV(RA, rB);

	if (Core::g_CoreStartupParameter.bFastmem)
	{
		Operand2 mask(3, 1); // ~(Memory::MEMVIEW32_MASK)
		BIC(rB, rB, mask); // 1
		MOVI2R(rA, (u32)Memory::base, false); // 2-3
		ADD(rB, rB, rA); // 4

		NEONXEmitter nemit(this);
		if (single)
		{
			VLDR(S0, rB, 0);
			nemit.VREV32(I_8, D0, D0); // Byte swap to result
			VCVT(v0, S0, 0);
			VCVT(v1, S0, 0);
		}
		else
		{
			VLDR(v0, rB, 0);
			nemit.VREV64(I_8, v0, v0); // Byte swap to result
		}
	}
	else
	{
		PUSH(4, R0, R1, R2, R3);
		MOV(R0, rB);
		if (single)
		{
			MOVI2R(rA, (u32)&Memory::Read_U32);
			BL(rA);

			VMOV(S0, R0);

			VCVT(v0, S0, 0);
			VCVT(v1, S0, 0);
		}
		else
		{
			MOVI2R(rA, (u32)&Memory::Read_F64);
			BL(rA);

#if !defined(__ARM_PCS_VFP) // SoftFP returns in R0 and R1
			VMOV(v0, R0);
#else
			VMOV(v0, D0);
#endif
		}
		POP(4, R0, R1, R2, R3);
	}
	gpr.Unlock(rA, rB);
	SetJumpTarget(DoNotLoad);
}

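// Fastmem note (the numbered comments above count the emitted instructions):
// BIC with Operand2(3, 1) clears the top two address bits, i.e.
// ~MEMVIEW32_MASK, so adding Memory::base turns the PowerPC address into a
// direct host pointer, and VREV32/VREV64 perform the big-endian byte swap.
// The slow path instead calls Memory::Read_U32/Read_F64 with the guest
// address and moves the return value into the VFP registers; the
// __ARM_PCS_VFP split covers the soft-float versus hard-float return
// conventions.
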
void JitArmILAsmRoutineManager::Generate()
{
	enterCode = GetCodePtr();
	PUSH(9, R4, R5, R6, R7, R8, R9, R10, R11, _LR);

	// Take care to 8-byte align stack for function calls.
	// We are misaligned here because of an odd number of args for PUSH.
	// It's not like x86 where you need to account for an extra 4 bytes
	// consumed by CALL.
	SUB(_SP, _SP, 4);

	MOVI2R(R0, (u32)&CoreTiming::downcount);
	MOVI2R(R9, (u32)&PowerPC::ppcState.spr[0]);

	FixupBranch skipToRealDispatcher = B();
	dispatcher = GetCodePtr();
	printf("ILDispatcher is %p\n", dispatcher);

	// Downcount Check
	// The result of slice decrementation should be in flags if somebody jumped here
	// IMPORTANT - We jump on negative, not carry!!!
	FixupBranch bail = B_CC(CC_MI);

	SetJumpTarget(skipToRealDispatcher);
	dispatcherNoCheck = GetCodePtr();

	// This block of code gets the address of the compiled block of code.
	// It runs through to the compiling portion if it isn't found.
	LDR(R12, R9, PPCSTATE_OFF(pc)); // Load the current PC into R12

	Operand2 iCacheMask = Operand2(0xE, 2); // JIT_ICACHE_MASK
	BIC(R12, R12, iCacheMask); // R12 contains PC & JIT_ICACHE_MASK here.

	MOVI2R(R14, (u32)jit->GetBlockCache()->iCache);

	LDR(R12, R14, R12); // R12 contains iCache[PC & JIT_ICACHE_MASK] here
	// R12 Confirmed this is the correct iCache Location loaded.
	TST(R12, 0x80); // Test to see if it is a JIT block.

	SetCC(CC_EQ);
	// Success, it is our Jitblock.
	MOVI2R(R14, (u32)jit->GetBlockCache()->GetCodePointers());
	// LDR R14 right here to get CodePointers()[0] pointer.
	LSL(R12, R12, 2); // Multiply by four because address locations are u32 in size
	LDR(R14, R14, R12); // Load the block address in to R14
	B(R14);
	// No need to jump anywhere after here, the block will go back to dispatcher start
	SetCC();

	// If we get to this point, that means that we don't have the block cached to execute,
	// so call ArmJit to compile the block and then execute it.
	MOVI2R(R14, (u32)&Jit);
	BL(R14);
	B(dispatcherNoCheck);

	// fpException()
	// Floating Point Exception Check, Jumped to if false
	fpException = GetCodePtr();
	LDR(R0, R9, PPCSTATE_OFF(Exceptions));
	ORR(R0, R0, EXCEPTION_FPU_UNAVAILABLE);
	STR(R0, R9, PPCSTATE_OFF(Exceptions));
	QuickCallFunction(R14, (void*)&PowerPC::CheckExceptions);
	LDR(R0, R9, PPCSTATE_OFF(npc));
	STR(R0, R9, PPCSTATE_OFF(pc));
	B(dispatcher);

	SetJumpTarget(bail);
	doTiming = GetCodePtr();
	// XXX: In JIT64, Advance() gets called /after/ the exception checking
	// once it jumps back to the start of outerLoop
	QuickCallFunction(R14, (void*)&CoreTiming::Advance);

	// Does exception checking
	testExceptions = GetCodePtr();
	LDR(R0, R9, PPCSTATE_OFF(pc));
	STR(R0, R9, PPCSTATE_OFF(npc));
	QuickCallFunction(R14, (void*)&PowerPC::CheckExceptions);
	LDR(R0, R9, PPCSTATE_OFF(npc));
	STR(R0, R9, PPCSTATE_OFF(pc));

	// Check the state pointer to see if we are exiting.
	// Gets checked on every exception check.
	MOVI2R(R0, (u32)PowerPC::GetStatePtr());
	MVN(R1, 0);
	LDR(R0, R0);
	TST(R0, R1);
	FixupBranch Exit = B_CC(CC_NEQ);

	B(dispatcher);

	SetJumpTarget(Exit);

	ADD(_SP, _SP, 4);

	POP(9, R4, R5, R6, R7, R8, R9, R10, R11, _PC); // Returns

	GenerateCommon();

	FlushIcache();
}

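// Dispatcher flow, as laid out above: blocks jump back here with the flags
// still set from their downcount decrement, so a negative result bails
// straight to doTiming (CoreTiming::Advance() plus the exception check).
// Otherwise the PC is masked with JIT_ICACHE_MASK and used to index iCache;
// an entry with bit 0x80 set means "not compiled yet" and falls through to
// the Jit() call, while a valid entry is scaled by 4 to index
// GetCodePointers() and jumped to directly.
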
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;

	int fs = _FS;
	int fd = _FD;

	switch (op & 0x3f) {
	case 5: //F(fd) = fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PAND(fpr.RX(fd), M(ssNoSignMask));
		break;

	case 6: //F(fd) = F(fs); break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			MOVSS(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7: //F(fd) = -F(fs); break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PXOR(fpr.RX(fd), M(ssSignBits2));
		break;

	case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs); // this probably works, just badly tested
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s
		{
			fpr.SpillLock(fs, fd);
			fpr.StoreFromRegister(fd);
			CVTTSS2SI(EAX, fpr.R(fs));

			// Did we get an indefinite integer value?
			CMP(32, R(EAX), Imm32(0x80000000));
			FixupBranch skip = J_CC(CC_NE);
			MOVSS(XMM0, fpr.R(fs));
			XORPS(XMM1, R(XMM1));
			CMPSS(XMM0, R(XMM1), CMP_LT);

			// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
			// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
			MOVD_xmm(R(EAX), XMM0);
			XOR(32, R(EAX), Imm32(0x7fffffff));

			SetJumpTarget(skip);
			MOV(32, fpr.R(fd), R(EAX));
		}
		break;

	case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
		// Store to memory so we can read it as an integer value.
		fpr.StoreFromRegister(fs);
		CVTSI2SS(XMM0, fpr.R(fs));
		MOVSS(fpr.R(fd), XMM0);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
	case 36: //FsI(fd) = (int) F(fs); break; //cvt.w.s
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}

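// Why the 0x80000000 check in trunc.w.s: CVTTSS2SI returns the x86 "integer
// indefinite" value 0x80000000 for NaNs and out-of-range floats, whereas the
// target behavior saturates. The fixup compares the source against 0.0f
// (CMPSS yields an all-ones mask when negative) and XORs with 0x7fffffff:
// 0xffffffff ^ 0x7fffffff == 0x80000000 for negative overflow, while
// 0 ^ 0x7fffffff == 0x7fffffff covers positive overflow and NaN.
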
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(FPU);

	int fs = _FS;
	int fd = _FD;

	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);

		// Small optimization: 0 is our default mode anyway.
		if (setMXCSR == 0 && !js.hasSetRounding) {
			setMXCSR = -1;
		}
		if (setMXCSR != -1) {
			STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
			MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
			AND(32, R(TEMPREG), Imm32(~(3 << 13)));
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
			MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
			LDMXCSR(MIPSSTATE_VAR(temp));
		}

		(this->*conv)(TEMPREG, fpr.R(fs));

		// Did we get an indefinite integer value?
		CMP(32, R(TEMPREG), Imm32(0x80000000));
		FixupBranch skip = J_CC(CC_NE);
		if (fd != fs) {
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		XORPS(XMM1, R(XMM1));
		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
		XOR(32, R(TEMPREG), Imm32(0x7fffffff));

		SetJumpTarget(skip);
		MOVD_xmm(fpr.RX(fd), R(TEMPREG));

		if (setMXCSR != -1) {
			LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
		}
	};

	switch (op & 0x3f) {
	case 5: //F(fd) = fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssNoSignMask[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			ANDPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			ANDPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 6: //F(fd) = F(fs); break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7: //F(fd) = -F(fs); break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssSignBits2[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			XORPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			XORPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s
		execRounding(&XEmitter::CVTTSS2SI, -1);
		break;

	case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);
		if (fpr.IsMapped(fs)) {
			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
		} else {
			// If fs was fd, we'd be in the case above since we mapped fd.
			MOVSS(fpr.RX(fd), fpr.R(fs));
			CVTDQ2PS(fpr.RX(fd), fpr.R(fd));
		}
		break;

	case 36: //FsI(fd) = (int) F(fs); break; //cvt.w.s
		// Uses the current rounding mode.
		execRounding(&XEmitter::CVTSS2SI, -1);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
		execRounding(&XEmitter::CVTSS2SI, 0);
		break;

	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
		execRounding(&XEmitter::CVTSS2SI, 2);
		break;

	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
		execRounding(&XEmitter::CVTSS2SI, 1);
		break;

	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}

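// setMXCSR selects the x86 MXCSR rounding-control field (bits 13-14):
// 0 = round to nearest, 1 = round down (floor), 2 = round up (ceil),
// 3 = truncate. round/floor/ceil.w.s pass 0/1/2 and swap MXCSR around the
// conversion; trunc.w.s passes -1 since CVTTSS2SI always truncates, and
// cvt.w.s passes -1 to honor whatever rounding mode is currently in effect.
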
void JitArm::stX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreOff);

	u32 a = inst.RA, b = inst.RB, s = inst.RS;
	s32 offset = inst.SIMM_16;
	u32 accessSize = 0;
	s32 regOffset = -1;
	bool update = false;
	bool fastmem = false;

	switch (inst.OPCD)
	{
	case 45: // sthu
		update = true;
	case 44: // sth
		accessSize = 16;
		break;
	case 31:
		switch (inst.SUBOP10)
		{
		case 183: // stwux
			update = true;
		case 151: // stwx
			fastmem = true;
			accessSize = 32;
			regOffset = b;
			break;
		case 247: // stbux
			update = true;
		case 215: // stbx
			accessSize = 8;
			regOffset = b;
			break;
		case 439: // sthux
			update = true;
		case 407: // sthx
			accessSize = 16;
			regOffset = b;
			break;
		}
		break;
	case 37: // stwu
		update = true;
	case 36: // stw
		fastmem = true;
		accessSize = 32;
		break;
	case 39: // stbu
		update = true;
	case 38: // stb
		accessSize = 8;
		break;
	}

	SafeStoreFromReg(fastmem, update ? a : (a ? a : -1), s, regOffset, accessSize, offset);

	if (update)
	{
		ARMReg rA = gpr.GetReg();
		ARMReg RB;
		ARMReg RA = gpr.R(a);
		if (regOffset != -1)
			RB = gpr.R(regOffset);

		// Check for DSI exception prior to writing back address
		LDR(rA, R9, PPCSTATE_OFF(Exceptions));
		TST(rA, EXCEPTION_DSI);
		FixupBranch DoNotWrite = B_CC(CC_NEQ);

		if (a)
		{
			if (regOffset == -1)
			{
				MOVI2R(rA, offset);
				ADD(RA, RA, rA);
			}
			else
			{
				ADD(RA, RA, RB);
			}
		}
		else
		{
			if (regOffset == -1)
				MOVI2R(RA, (u32)offset);
			else
				MOV(RA, RB);
		}
		SetJumpTarget(DoNotWrite);
		gpr.Unlock(rA);
	}
}

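// Writeback ordering note: the updated effective address is only written to
// RA after confirming the store did not raise a DSI. That matches PowerPC
// update-form semantics - on a data storage interrupt, RA must be left
// untouched so the instruction can be restarted.
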
void MIPSEmitter::BNE(MIPSReg rs, MIPSReg rt, const void *func, std::function<void ()> delaySlot) {
	SetJumpTarget(BNE(rs, rt, delaySlot), func);
}

void Jit64::fcmpx(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	FALLBACK_IF(jo.fpAccurateFcmp);

	//bool ordered = inst.SUBOP10 == 32;
	int a = inst.FA;
	int b = inst.FB;
	int crf = inst.CRFD;

	fpr.Lock(a, b);
	fpr.BindToRegister(b, true);

	// Are we masking sNaN invalid floating point exceptions? If not this could crash if we don't handle the exception?
	UCOMISD(fpr.R(b).GetSimpleReg(), fpr.R(a));

	FixupBranch pNaN, pLesser, pGreater;
	FixupBranch continue1, continue2, continue3;

	if (a != b)
	{
		// if B > A, goto Lesser's jump target
		pLesser = J_CC(CC_A);
	}

	// if (B != B) or (A != A), goto NaN's jump target
	pNaN = J_CC(CC_P);

	if (a != b)
	{
		// if B < A, goto Greater's jump target
		// JB can't precede the NaN check because it doesn't test ZF
		pGreater = J_CC(CC_B);
	}

	// Equal
	MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x2));
	continue1 = J();

	// NAN
	SetJumpTarget(pNaN);
	MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x1));

	if (a != b)
	{
		continue2 = J();

		// Greater Than
		SetJumpTarget(pGreater);
		MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x4));
		continue3 = J();

		// Less Than
		SetJumpTarget(pLesser);
		MOV(8, M(&PowerPC::ppcState.cr_fast[crf]), Imm8(0x8));
	}

	SetJumpTarget(continue1);
	if (a != b)
	{
		SetJumpTarget(continue2);
		SetJumpTarget(continue3);
	}

	fpr.UnlockAll();
}

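// cr_fast uses one byte per CR field with the PowerPC compare encoding:
// 0x8 = LT, 0x4 = GT, 0x2 = EQ, 0x1 = SO/unordered. UCOMISD sets PF on an
// unordered compare, which is why the NaN branch (CC_P) must be tested
// before CC_B: CF is also set when the compare is unordered, so a plain JB
// would misclassify a NaN result as "greater".
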
const u8 *Jit::DoJit(u32 em_address, JitBlock *b)
{
	js.cancel = false;
	js.blockStart = js.compilerPC = mips_->pc;
	js.nextExit = 0;
	js.downcountAmount = 0;
	js.curBlock = b;
	js.compiling = true;
	js.inDelaySlot = false;
	js.afterOp = JitState::AFTER_NONE;
	js.PrefixStart();

	// We add a check before the block, used when entering from a linked block.
	b->checkedEntry = GetCodePtr();
	// Downcount flag check. The last block decremented downcounter, and the flag should still be available.
	FixupBranch skip = J_CC(CC_NBE);
	MOV(32, M(&mips_->pc), Imm32(js.blockStart));
	JMP(asm_.outerLoop, true);  // downcount hit zero - go advance.
	SetJumpTarget(skip);

	b->normalEntry = GetCodePtr();

	MIPSAnalyst::AnalysisResults analysis = MIPSAnalyst::Analyze(em_address);

	gpr.Start(mips_, analysis);
	fpr.Start(mips_, analysis);

	js.numInstructions = 0;
	while (js.compiling) {
		// Jit breakpoints are quite fast, so let's do them in release too.
		CheckJitBreakpoint(js.compilerPC, 0);

		MIPSOpcode inst = Memory::Read_Instruction(js.compilerPC);
		js.downcountAmount += MIPSGetInstructionCycleEstimate(inst);

		MIPSCompileOp(inst);

		if (js.afterOp & JitState::AFTER_CORE_STATE) {
			// TODO: Save/restore?
			FlushAll();

			// If we're rewinding, CORE_NEXTFRAME should not cause a rewind.
			// It doesn't really matter either way if we're not rewinding.
			// CORE_RUNNING is <= CORE_NEXTFRAME.
			CMP(32, M((void*)&coreState), Imm32(CORE_NEXTFRAME));
			FixupBranch skipCheck = J_CC(CC_LE);
			if (js.afterOp & JitState::AFTER_REWIND_PC_BAD_STATE)
				MOV(32, M(&mips_->pc), Imm32(js.compilerPC));
			else
				MOV(32, M(&mips_->pc), Imm32(js.compilerPC + 4));
			WriteSyscallExit();
			SetJumpTarget(skipCheck);

			js.afterOp = JitState::AFTER_NONE;
		}

		js.compilerPC += 4;
		js.numInstructions++;

		// Safety check, in case we get a bunch of really large jit ops without a lot of branching.
		if (GetSpaceLeft() < 0x800) {
			FlushAll();
			WriteExit(js.compilerPC, js.nextExit++);
			js.compiling = false;
		}
	}

	b->codeSize = (u32)(GetCodePtr() - b->normalEntry);
	NOP();
	AlignCode4();
	b->originalSize = js.numInstructions;
	return b->normalEntry;
}

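// Two entry points per block, per the comments above: checkedEntry is used
// when arriving from a linked block and re-tests the downcount flags the
// previous block left behind, escaping to asm_.outerLoop once the slice is
// spent, while normalEntry bypasses that check. The GetSpaceLeft() < 0x800
// guard force-closes a block early so the emitter never runs out of code
// space in the middle of an instruction.
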