// Compiles the FPU <-> GPR transfer instructions (mfc1/cfc1/mtc1/ctc1),
// dispatched on the rs field (bits 25:21) of the opcode.
// The interpreter equivalent of each case is quoted in its case comment.
void Jit::Comp_mxc1(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch((op >> 21) & 0x1f) {
	case 0: // R(rt) = FI(fs); break; //mfc1
		// Writes to $zero are architectural no-ops, so emit nothing then.
		if (rt != MIPS_REG_ZERO) {
			fpr.MapReg(fs, true, false);
			// TODO: Seems the V register becomes dirty here? It shouldn't.
			gpr.MapReg(rt, false, true);
			// Raw 32-bit move from the FPR's XMM low lane into the GPR.
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		// FCR reads are not compiled here - fall back to the interpreter.
		Comp_Generic(op);
		return;

	case 4: //FI(fs) = R(rt); break; //mtc1
		// Raw 32-bit move from the GPR into the FPR's XMM low lane.
		gpr.MapReg(rt, true, false);
		fpr.MapReg(fs, false, true);
		MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		// FCR writes are not compiled here - fall back to the interpreter.
		Comp_Generic(op);
		return;
	}
}
// Expands the 4444 pixel in resultReg to 8888, in place.
// Each 4-bit channel is widened to 8 bits by replicating the nibble
// (n -> (n << 4) | n), which maps 0x0 -> 0x00 and 0xF -> 0xFF exactly.
bool SamplerJitCache::Jit_Decode4444() {
	MOVD_xmm(fpScratchReg1, R(resultReg));
	// Duplicate each source byte so every 16-bit lane holds two copies of it.
	PUNPCKLBW(fpScratchReg1, R(fpScratchReg1));
	if (RipAccessible(color4444mask)) {
		// Mask so each byte keeps only its own channel's nibble
		// (presumably color4444mask alternates nibbles - see its definition.)
		PAND(fpScratchReg1, M(color4444mask)); // rip accessible
	} else {
		// NOTE(review): no non-RIP fallback exists yet for this constant.
		Crash();
	}
	// Build two shifted copies so each nibble lands in both halves of its
	// destination byte, then merge.
	MOVSS(fpScratchReg2, R(fpScratchReg1));
	MOVSS(fpScratchReg3, R(fpScratchReg1));
	PSRLW(fpScratchReg2, 4);
	PSLLW(fpScratchReg3, 4);
	POR(fpScratchReg1, R(fpScratchReg2));
	POR(fpScratchReg1, R(fpScratchReg3));
	MOVD_xmm(R(resultReg), fpScratchReg1);
	return true;
}
// Emits a compare of F(lhs) vs F(rhs) with the given SSE compare predicate
// and stores the resulting all-ones/all-zeros mask into the FPCOND reg.
// When allowNaN is set, an unordered result (either operand NaN) also
// yields true - needed for the negated predicates like !<> or !>.
void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN) {
	gpr.MapReg(MIPS_REG_FPCOND, false, true);
	MOVSS(XMM0, fpr.R(lhs));
	// CMPSS leaves 0xFFFFFFFF (true) or 0 (false) in the low lane.
	CMPSS(XMM0, fpr.R(rhs), compare);
	MOVD_xmm(gpr.R(MIPS_REG_FPCOND), XMM0);

	// This means that NaN also means true, e.g. !<> or !>, etc.
	if (allowNaN) {
		// OR in the unordered-compare mask via EAX.
		MOVSS(XMM0, fpr.R(lhs));
		CMPUNORDSS(XMM0, fpr.R(rhs));
		MOVD_xmm(R(EAX), XMM0);
		OR(32, gpr.R(MIPS_REG_FPCOND), R(EAX));
	}
}
// Compiles the FPU load/store instructions lwc1 (opcode 49) and swc1
// (opcode 57), dispatched on the primary opcode field (bits 31:26).
// Uses JitSafeMem to emit a fast direct-memory path plus a slow fallback
// through the safeMemFuncs helpers.
void Jit::Comp_FPULS(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	s32 offset = _IMM16;
	int ft = _FT;
	MIPSGPReg rs = _RS;

	switch(op >> 26) {
	case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1
		{
			gpr.Lock(rs);
			fpr.SpillLock(ft);
			fpr.MapReg(ft, false, true);

			JitSafeMem safe(this, rs, offset);
			OpArg src;
			if (safe.PrepareRead(src, 4))
				MOVSS(fpr.RX(ft), src);
			if (safe.PrepareSlowRead(safeMemFuncs.readU32))
				// The slow-read helper returns the loaded word in EAX.
				MOVD_xmm(fpr.RX(ft), R(EAX));
			safe.Finish();

			gpr.UnlockAll();
			fpr.ReleaseSpillLocks();
		}
		break;

	case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1
		{
			gpr.Lock(rs);
			fpr.SpillLock(ft);
			fpr.MapReg(ft, true, false);

			JitSafeMem safe(this, rs, offset);
			OpArg dest;
			if (safe.PrepareWrite(dest, 4))
				MOVSS(dest, fpr.RX(ft));
			if (safe.PrepareSlowWrite()) {
				// Stage the value in memory so the slow-write helper can read it.
				MOVSS(M(&ssLoadStoreTemp), fpr.RX(ft));
				safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp));
			}
			safe.Finish();

			gpr.UnlockAll();
			fpr.ReleaseSpillLocks();
		}
		break;

	default:
		_dbg_assert_msg_(CPU,0,"Trying to interpret FPULS instruction that can't be interpreted");
		break;
	}
}
// Emits a compare of F(lhs) vs F(rhs) with the given SSE compare predicate
// and stores the resulting all-ones/all-zeros mask into the FPCOND reg,
// with a single write to FPCOND at the end.
void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN) {
	gpr.MapReg(MIPS_REG_FPCOND, false, true);

	// This means that NaN also means true, e.g. !<> or !>, etc.
	if (allowNaN) {
		// Compute the ordered compare and the unordered (NaN) test separately,
		// then OR the masks so an unordered result forces true.
		CopyFPReg(XMM0, fpr.R(lhs));
		CopyFPReg(XMM1, fpr.R(lhs));
		CMPSS(XMM0, fpr.R(rhs), compare);
		CMPUNORDSS(XMM1, fpr.R(rhs));
		POR(XMM0, R(XMM1));
	} else {
		CopyFPReg(XMM0, fpr.R(lhs));
		CMPSS(XMM0, fpr.R(rhs), compare);
	}
	// CMPSS/POR leave 0xFFFFFFFF (true) or 0 (false) in the low lane.
	MOVD_xmm(gpr.R(MIPS_REG_FPCOND), XMM0);
}
// Compiles single-operand FPU ops (function field, bits 5:0):
// abs/mov/neg/sqrt/trunc.w.s/cvt.s.w. The rounding-mode dependent
// conversions (round/ceil/floor/cvt.w.s) fall through to DISABLE and are
// handled by the interpreter.
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	int fs = _FS;
	int fd = _FD;

	switch (op & 0x3f) {
	case 5: //F(fd) = fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		// Clear the sign bit.
		PAND(fpr.RX(fd), M(ssNoSignMask));
		break;

	case 6: //F(fd) = F(fs); break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			MOVSS(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7: //F(fd) = -F(fs); break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		// Flip the sign bit.
		PXOR(fpr.RX(fd), M(ssSignBits2));
		break;

	case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		// this probably works, just badly tested
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break;//trunc.w.s
		{
			fpr.SpillLock(fs, fd);
			// fd is written as an integer below, so keep it in memory.
			fpr.StoreFromRegister(fd);
			CVTTSS2SI(EAX, fpr.R(fs));

			// Did we get an indefinite integer value?
			CMP(32, R(EAX), Imm32(0x80000000));
			FixupBranch skip = J_CC(CC_NE);
			// Out-of-range input: decide saturation direction by sign.
			MOVSS(XMM0, fpr.R(fs));
			XORPS(XMM1, R(XMM1));
			CMPSS(XMM0, R(XMM1), CMP_LT);
			// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
			// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
			MOVD_xmm(R(EAX), XMM0);
			XOR(32, R(EAX), Imm32(0x7fffffff));
			SetJumpTarget(skip);
			MOV(32, fpr.R(fd), R(EAX));
		}
		break;

	case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
		// Store to memory so we can read it as an integer value.
		fpr.StoreFromRegister(fs);
		CVTSI2SS(XMM0, fpr.R(fs));
		// NOTE(review): if fd is currently register-mapped, this writes the
		// mapped register via fpr.R(fd); confirm the dirty state is tracked.
		MOVSS(fpr.R(fd), XMM0);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
	case 36: //FsI(fd) = (int) F(fs); break; //cvt.w.s
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
// Compiles the FPU <-> GPR transfer instructions (mfc1/cfc1/mtc1/ctc1),
// dispatched on bits 25:21. This revision compiles cfc1/ctc1 for FCR31,
// keeping the condition bit (bit 23) mirrored in the separate FPCOND reg.
void Jit::Comp_mxc1(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch((op >> 21) & 0x1f) {
	case 0: // R(rt) = FI(fs); break; //mfc1
		// Writes to $zero are no-ops; emit nothing.
		if (rt != MIPS_REG_ZERO) {
			fpr.MapReg(fs, true, false);
			// TODO: Seems the V register becomes dirty here? It shouldn't.
			gpr.MapReg(rt, false, true);
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		if (fs == 31) {
			// FPCOND is stored separately, so merge it into fcr31's bit 23.
			bool wasImm = gpr.IsImm(MIPS_REG_FPCOND);
			if (!wasImm) {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(MIPS_REG_FPCOND, true, false);
			}
			gpr.MapReg(rt, false, true);
			MOV(32, gpr.R(rt), M(&mips_->fcr31));
			if (wasImm) {
				// Known condition value: set or clear bit 23 directly.
				if (gpr.GetImm(MIPS_REG_FPCOND) & 1) {
					OR(32, gpr.R(rt), Imm32(1 << 23));
				} else {
					AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				}
			} else {
				// Insert FPCOND's bit 0 at bit 23.
				AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				MOV(32, R(EAX), gpr.R(MIPS_REG_FPCOND));
				AND(32, R(EAX), Imm32(1));
				SHL(32, R(EAX), Imm8(23));
				OR(32, gpr.R(rt), R(EAX));
			}
			gpr.UnlockAll();
		} else if (fs == 0) {
			// FCR0 reads as a constant (implementation/revision id.)
			gpr.SetImm(rt, MIPSState::FCR0_VALUE);
		} else {
			Comp_Generic(op);
		}
		return;

	case 4: //FI(fs) = R(rt); break; //mtc1
		gpr.MapReg(rt, true, false);
		fpr.MapReg(fs, false, true);
		MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		if (fs == 31) {
			if (gpr.IsImm(rt)) {
				// Known value: split out the condition bit and store the
				// writable fcr31 bits.
				gpr.SetImm(MIPS_REG_FPCOND, (gpr.GetImm(rt) >> 23) & 1);
				MOV(32, M(&mips_->fcr31), Imm32(gpr.GetImm(rt) & 0x0181FFFF));
			} else {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(rt, true, false);
				gpr.MapReg(MIPS_REG_FPCOND, false, true);
				// FPCOND = (rt >> 23) & 1
				MOV(32, gpr.R(MIPS_REG_FPCOND), gpr.R(rt));
				SHR(32, gpr.R(MIPS_REG_FPCOND), Imm8(23));
				AND(32, gpr.R(MIPS_REG_FPCOND), Imm32(1));
				MOV(32, M(&mips_->fcr31), gpr.R(rt));
				AND(32, M(&mips_->fcr31), Imm32(0x0181FFFF));
				gpr.UnlockAll();
			}
		} else {
// Compiles the FPU <-> GPR transfer instructions (mfc1/cfc1/mtc1/ctc1),
// dispatched on bits 25:21. This revision also tracks the rounding mode
// on ctc1 writes to FCR31 and skips work when rt is $zero.
void Jit::Comp_mxc1(MIPSOpcode op) {
	CONDITIONAL_DISABLE(FPU_XFER);
	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch ((op >> 21) & 0x1f) {
	case 0: // R(rt) = FI(fs); break; //mfc1
		// Writes to $zero are no-ops.
		if (rt == MIPS_REG_ZERO)
			return;
		gpr.MapReg(rt, false, true);
		// If fs is not mapped, most likely it's being abandoned.
		// Just load from memory in that case.
		if (fpr.R(fs).IsSimpleReg()) {
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		} else {
			MOV(32, gpr.R(rt), fpr.R(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		if (rt == MIPS_REG_ZERO)
			return;
		if (fs == 31) {
			// FPCOND is stored separately, so merge it into fcr31's bit 23.
			bool wasImm = gpr.IsImm(MIPS_REG_FPCOND);
			if (!wasImm) {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(MIPS_REG_FPCOND, true, false);
			}
			gpr.MapReg(rt, false, true);
			MOV(32, gpr.R(rt), MIPSSTATE_VAR(fcr31));
			if (wasImm) {
				// Known condition value: set or clear bit 23 directly.
				if (gpr.GetImm(MIPS_REG_FPCOND) & 1) {
					OR(32, gpr.R(rt), Imm32(1 << 23));
				} else {
					AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				}
			} else {
				// Insert FPCOND's bit 0 at bit 23.
				AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				MOV(32, R(TEMPREG), gpr.R(MIPS_REG_FPCOND));
				AND(32, R(TEMPREG), Imm32(1));
				SHL(32, R(TEMPREG), Imm8(23));
				OR(32, gpr.R(rt), R(TEMPREG));
			}
			gpr.UnlockAll();
		} else if (fs == 0) {
			// FCR0 reads as a constant (implementation/revision id.)
			gpr.SetImm(rt, MIPSState::FCR0_VALUE);
		} else {
			Comp_Generic(op);
		}
		return;

	case 4: //FI(fs) = R(rt); break; //mtc1
		fpr.MapReg(fs, false, true);
		if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
			// Writing zero: cheaper to XOR the XMM with itself.
			XORPS(fpr.RX(fs), fpr.R(fs));
		} else {
			gpr.KillImmediate(rt, true, false);
			MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		}
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		if (fs == 31) {
			// Must clear before setting, since ApplyRoundingMode() assumes it was cleared.
			RestoreRoundingMode();
			if (gpr.IsImm(rt)) {
				// Known value: split out the condition bit, store the writable
				// fcr31 bits, and update the host rounding mode if needed.
				gpr.SetImm(MIPS_REG_FPCOND, (gpr.GetImm(rt) >> 23) & 1);
				MOV(32, MIPSSTATE_VAR(fcr31), Imm32(gpr.GetImm(rt) & 0x0181FFFF));
				if ((gpr.GetImm(rt) & 0x1000003) == 0) {
					// Default nearest / no-flush mode, just leave it cleared.
				} else {
					UpdateRoundingMode(gpr.GetImm(rt));
					ApplyRoundingMode();
				}
			} else {
// Compiles single-operand FPU ops (function field, bits 5:0):
// abs/mov/neg/sqrt plus all the float->int conversions
// (trunc/round/ceil/floor/cvt.w.s) and cvt.s.w.
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(FPU);
	int fs = _FS;
	int fd = _FD;

	// Shared emitter for the float->int conversions. conv is the XEmitter
	// member performing the conversion; setMXCSR is a temporary value for
	// MXCSR's rounding-control field (bits 13-14), or -1 to keep the
	// current mode.
	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);

		// Small optimization: 0 is our default mode anyway.
		if (setMXCSR == 0 && !js.hasSetRounding) {
			setMXCSR = -1;
		}
		// Swap in the requested rounding mode, saving the old MXCSR.
		if (setMXCSR != -1) {
			STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
			MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
			AND(32, R(TEMPREG), Imm32(~(3 << 13)));
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
			MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
			LDMXCSR(MIPSSTATE_VAR(temp));
		}

		(this->*conv)(TEMPREG, fpr.R(fs));

		// Did we get an indefinite integer value?
		CMP(32, R(TEMPREG), Imm32(0x80000000));
		FixupBranch skip = J_CC(CC_NE);
		// Out-of-range input: decide the saturation direction by sign.
		if (fd != fs) {
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		XORPS(XMM1, R(XMM1));
		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);
		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
		XOR(32, R(TEMPREG), Imm32(0x7fffffff));

		SetJumpTarget(skip);
		MOVD_xmm(fpr.RX(fd), R(TEMPREG));

		// Restore the caller's MXCSR.
		if (setMXCSR != -1) {
			LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
		}
	};

	switch (op & 0x3f) {
	case 5: //F(fd) = fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssNoSignMask[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			// Load the mask first, then AND in fs - saves one move.
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			ANDPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			// Clear the sign bit.
			ANDPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 6: //F(fd) = F(fs); break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7: //F(fd) = -F(fs); break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssSignBits2[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			// Load the sign-bit constant first, then XOR in fs.
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			XORPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			// Flip the sign bit.
			XORPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 4: //F(fd) = sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s
		// CVTTSS2SI truncates regardless of the current rounding mode.
		execRounding(&XEmitter::CVTTSS2SI, -1);
		break;

	case 32: //F(fd) = (float)FsI(fs); break; //cvt.s.w
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);
		if (fpr.IsMapped(fs)) {
			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
		} else {
			// If fs was fd, we'd be in the case above since we mapped fd.
			MOVSS(fpr.RX(fd), fpr.R(fs));
			CVTDQ2PS(fpr.RX(fd), fpr.R(fd));
		}
		break;

	case 36: //FsI(fd) = (int) F(fs); break; //cvt.w.s
		// Uses the current rounding mode.
		execRounding(&XEmitter::CVTSS2SI, -1);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
		execRounding(&XEmitter::CVTSS2SI, 0);
		break;

	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
		execRounding(&XEmitter::CVTSS2SI, 2);
		break;

	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
		execRounding(&XEmitter::CVTSS2SI, 1);
		break;

	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
// Compiles the bilinear-filtering sampler for the given sampler id.
// Emits two routines: an internal "nearest" texel fetch (CALLed four
// times) and the externally exposed linear function that blends the four
// fetched texels by frac_u/frac_v. Returns nullptr if the texture format
// can't be handled.
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();
	if (!Jit_ReadTextureFormat(id)) {
		// Unsupported format: roll back the partially emitted code.
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}
	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		// Null source: return 0.
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	// Fetches one texel through the nearest routine, using the u/v values at
	// the given offset into the uptr/vptr arrays, and stashes the result on
	// the stack at that same offset.
	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM. Separate CLUTs is uncommon.
		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		// Zero XMM0 for the unpack-based widening below.
		PXOR(XMM0, R(XMM0));
	}
	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));
	if (cpu_info.bSSE4_1) {
		// Zero-extend each byte straight to a dword lane.
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		// Widen bytes -> words -> dwords by unpacking with zero.
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256)); // rip accessible
	} else {
		Crash(); // TODO
	}
	// NOTE(review): M(ones) here and the second M(by256) below are used
	// without a RipAccessible() guard, unlike the first by256 use above -
	// confirm those constants are always rip-accessible.
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));
	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));
	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	// Epilogue: pops mirror the pushes above.
	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);
	RET();

	EndWrite();
	return (LinearFunc)start;
}
// Compiles stfd: stores the 64-bit FPR (RS) to EA = (RA) + SIMM as two
// 32-bit halves. Emits a fast path for normal RAM (TEST against mem_mask)
// and a slow SafeWriteRegToReg path for everything else.
void Jit64::stfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	// Fall back when memchecks are active or RA is 0 (no base register.)
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	// Address bits that force the slow path; widened when MMU/TLB-hack or
	// (debug builds) memory checking is enabled.
	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack) {
		mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging) {
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	// Compute the effective address into ABI_PARAM1 and pick a path.
	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3) {
		MOVAPD(XMM0, fpr.R(s));
		// bswapShuffle1x8 presumably byteswaps the 8-byte value - confirm
		// against its definition.
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	} else {
		// No SSSE3: write the two 32-bit halves separately
		// (high half at EA+0, low half at EA+4.)
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);
		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);

	SetJumpTarget(safe);
	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));
	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	// Recompute the EA - the first safe write may have clobbered ABI_PARAM1.
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);
	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}