Beispiel #1
0
void Jit::Comp_mxc1(MIPSOpcode op)
{
	CONDITIONAL_DISABLE;

	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch((op >> 21) & 0x1f) 
	{
	case 0: // R(rt) = FI(fs); break; //mfc1
		if (rt != MIPS_REG_ZERO) {
			fpr.MapReg(fs, true, false);  // TODO: Seems the V register becomes dirty here? It shouldn't.
			gpr.MapReg(rt, false, true);
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		Comp_Generic(op);
		return;

	case 4: //FI(fs) = R(rt);	break; //mtc1
		gpr.MapReg(rt, true, false);
		fpr.MapReg(fs, false, true);
		MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		Comp_Generic(op);
		return;
	}
}
Beispiel #2
0
bool SamplerJitCache::Jit_Decode4444() {
	MOVD_xmm(fpScratchReg1, R(resultReg));
	PUNPCKLBW(fpScratchReg1, R(fpScratchReg1));
	if (RipAccessible(color4444mask)) {
		PAND(fpScratchReg1, M(color4444mask));  // rip accessible
	} else {
		Crash();
	}
	MOVSS(fpScratchReg2, R(fpScratchReg1));
	MOVSS(fpScratchReg3, R(fpScratchReg1));
	PSRLW(fpScratchReg2, 4);
	PSLLW(fpScratchReg3, 4);
	POR(fpScratchReg1, R(fpScratchReg2));
	POR(fpScratchReg1, R(fpScratchReg3));
	MOVD_xmm(R(resultReg), fpScratchReg1);
	return true;
}
Beispiel #3
0
void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN)
{
	gpr.MapReg(MIPS_REG_FPCOND, false, true);
	MOVSS(XMM0, fpr.R(lhs));
	CMPSS(XMM0, fpr.R(rhs), compare);
	MOVD_xmm(gpr.R(MIPS_REG_FPCOND), XMM0);

	// This means that NaN also means true, e.g. !<> or !>, etc.
	if (allowNaN)
	{
		MOVSS(XMM0, fpr.R(lhs));
		CMPUNORDSS(XMM0, fpr.R(rhs));

		MOVD_xmm(R(EAX), XMM0);
		OR(32, gpr.R(MIPS_REG_FPCOND), R(EAX));
	}
}
Beispiel #4
0
void Jit::Comp_FPULS(MIPSOpcode op)
{
	CONDITIONAL_DISABLE;
	s32 offset = _IMM16;
	int ft = _FT;
	MIPSGPReg rs = _RS;

	switch(op >> 26)
	{
	case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1
		{
			gpr.Lock(rs);
			fpr.SpillLock(ft);
			fpr.MapReg(ft, false, true);

			JitSafeMem safe(this, rs, offset);
			OpArg src;
			if (safe.PrepareRead(src, 4))
				MOVSS(fpr.RX(ft), src);
			if (safe.PrepareSlowRead(safeMemFuncs.readU32))
				MOVD_xmm(fpr.RX(ft), R(EAX));
			safe.Finish();

			gpr.UnlockAll();
			fpr.ReleaseSpillLocks();
		}
		break;
	case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1
		{
			gpr.Lock(rs);
			fpr.SpillLock(ft);
			fpr.MapReg(ft, true, false);

			JitSafeMem safe(this, rs, offset);
			OpArg dest;
			if (safe.PrepareWrite(dest, 4))
				MOVSS(dest, fpr.RX(ft));
			if (safe.PrepareSlowWrite())
			{
				MOVSS(M(&ssLoadStoreTemp), fpr.RX(ft));
				safe.DoSlowWrite(safeMemFuncs.writeU32, M(&ssLoadStoreTemp));
			}
			safe.Finish();

			gpr.UnlockAll();
			fpr.ReleaseSpillLocks();
		}
		break;

	default:
		_dbg_assert_msg_(CPU,0,"Trying to interpret FPULS instruction that can't be interpreted");
		break;
	}
}
Beispiel #5
0
void Jit::CompFPComp(int lhs, int rhs, u8 compare, bool allowNaN) {
	gpr.MapReg(MIPS_REG_FPCOND, false, true);

	// This means that NaN also means true, e.g. !<> or !>, etc.
	if (allowNaN) {
		CopyFPReg(XMM0, fpr.R(lhs));
		CopyFPReg(XMM1, fpr.R(lhs));
		CMPSS(XMM0, fpr.R(rhs), compare);
		CMPUNORDSS(XMM1, fpr.R(rhs));

		POR(XMM0, R(XMM1));
	} else {
		CopyFPReg(XMM0, fpr.R(lhs));
		CMPSS(XMM0, fpr.R(rhs), compare);
	}

	MOVD_xmm(gpr.R(MIPS_REG_FPCOND), XMM0);
}
Beispiel #6
0
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	
	int fs = _FS;
	int fd = _FD;

	switch (op & 0x3f) 
	{
	case 5:	//F(fd)	= fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PAND(fpr.RX(fd), M(ssNoSignMask));
		break;

	case 6:	//F(fd)	= F(fs);				break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			MOVSS(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7:	//F(fd)	= -F(fs);			 break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PXOR(fpr.RX(fd), M(ssSignBits2));
		break;


	case 4:	//F(fd)	= sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs); // this probably works, just badly tested
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break;//trunc.w.s
		{
			fpr.SpillLock(fs, fd);
			fpr.StoreFromRegister(fd);
			CVTTSS2SI(EAX, fpr.R(fs));

			// Did we get an indefinite integer value?
			CMP(32, R(EAX), Imm32(0x80000000));
			FixupBranch skip = J_CC(CC_NE);
			MOVSS(XMM0, fpr.R(fs));
			XORPS(XMM1, R(XMM1));
			CMPSS(XMM0, R(XMM1), CMP_LT);

			// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
			// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
			MOVD_xmm(R(EAX), XMM0);
			XOR(32, R(EAX), Imm32(0x7fffffff));

			SetJumpTarget(skip);
			MOV(32, fpr.R(fd), R(EAX));
		}
		break;

	case 32: //F(fd)	= (float)FsI(fs);			break; //cvt.s.w
		// Store to memory so we can read it as an integer value.
		fpr.StoreFromRegister(fs);
		CVTSI2SS(XMM0, fpr.R(fs));
		MOVSS(fpr.R(fd), XMM0);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
	case 36: //FsI(fd) = (int)	F(fs);			 break; //cvt.w.s
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
Beispiel #7
0
void Jit::Comp_mxc1(MIPSOpcode op)
{
	CONDITIONAL_DISABLE;

	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch((op >> 21) & 0x1f) 
	{
	case 0: // R(rt) = FI(fs); break; //mfc1
		if (rt != MIPS_REG_ZERO) {
			fpr.MapReg(fs, true, false);  // TODO: Seems the V register becomes dirty here? It shouldn't.
			gpr.MapReg(rt, false, true);
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		if (fs == 31) {
			bool wasImm = gpr.IsImm(MIPS_REG_FPCOND);
			if (!wasImm) {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(MIPS_REG_FPCOND, true, false);
			}
			gpr.MapReg(rt, false, true);
			MOV(32, gpr.R(rt), M(&mips_->fcr31));
			if (wasImm) {
				if (gpr.GetImm(MIPS_REG_FPCOND) & 1) {
					OR(32, gpr.R(rt), Imm32(1 << 23));
				} else {
					AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				}
			} else {
				AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				MOV(32, R(EAX), gpr.R(MIPS_REG_FPCOND));
				AND(32, R(EAX), Imm32(1));
				SHL(32, R(EAX), Imm8(23));
				OR(32, gpr.R(rt), R(EAX));
			}
			gpr.UnlockAll();
		} else if (fs == 0) {
			gpr.SetImm(rt, MIPSState::FCR0_VALUE);
		} else {
			Comp_Generic(op);
		}
		return;

	case 4: //FI(fs) = R(rt);	break; //mtc1
		gpr.MapReg(rt, true, false);
		fpr.MapReg(fs, false, true);
		MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		if (fs == 31) {
			if (gpr.IsImm(rt)) {
				gpr.SetImm(MIPS_REG_FPCOND, (gpr.GetImm(rt) >> 23) & 1);
				MOV(32, M(&mips_->fcr31), Imm32(gpr.GetImm(rt) & 0x0181FFFF));
			} else {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(rt, true, false);
				gpr.MapReg(MIPS_REG_FPCOND, false, true);
				MOV(32, gpr.R(MIPS_REG_FPCOND), gpr.R(rt));
				SHR(32, gpr.R(MIPS_REG_FPCOND), Imm8(23));
				AND(32, gpr.R(MIPS_REG_FPCOND), Imm32(1));
				MOV(32, M(&mips_->fcr31), gpr.R(rt));
				AND(32, M(&mips_->fcr31), Imm32(0x0181FFFF));
				gpr.UnlockAll();
			}
		} else {
Beispiel #8
0
void Jit::Comp_mxc1(MIPSOpcode op) {
	CONDITIONAL_DISABLE(FPU_XFER);

	int fs = _FS;
	MIPSGPReg rt = _RT;

	switch ((op >> 21) & 0x1f) {
	case 0: // R(rt) = FI(fs); break; //mfc1
		if (rt == MIPS_REG_ZERO)
			return;
		gpr.MapReg(rt, false, true);
		// If fs is not mapped, most likely it's being abandoned.
		// Just load from memory in that case.
		if (fpr.R(fs).IsSimpleReg()) {
			MOVD_xmm(gpr.R(rt), fpr.RX(fs));
		} else {
			MOV(32, gpr.R(rt), fpr.R(fs));
		}
		break;

	case 2: // R(rt) = currentMIPS->ReadFCR(fs); break; //cfc1
		if (rt == MIPS_REG_ZERO)
			return;
		if (fs == 31) {
			bool wasImm = gpr.IsImm(MIPS_REG_FPCOND);
			if (!wasImm) {
				gpr.Lock(rt, MIPS_REG_FPCOND);
				gpr.MapReg(MIPS_REG_FPCOND, true, false);
			}
			gpr.MapReg(rt, false, true);
			MOV(32, gpr.R(rt), MIPSSTATE_VAR(fcr31));
			if (wasImm) {
				if (gpr.GetImm(MIPS_REG_FPCOND) & 1) {
					OR(32, gpr.R(rt), Imm32(1 << 23));
				} else {
					AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				}
			} else {
				AND(32, gpr.R(rt), Imm32(~(1 << 23)));
				MOV(32, R(TEMPREG), gpr.R(MIPS_REG_FPCOND));
				AND(32, R(TEMPREG), Imm32(1));
				SHL(32, R(TEMPREG), Imm8(23));
				OR(32, gpr.R(rt), R(TEMPREG));
			}
			gpr.UnlockAll();
		} else if (fs == 0) {
			gpr.SetImm(rt, MIPSState::FCR0_VALUE);
		} else {
			Comp_Generic(op);
		}
		return;

	case 4: //FI(fs) = R(rt);	break; //mtc1
		fpr.MapReg(fs, false, true);
		if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
			XORPS(fpr.RX(fs), fpr.R(fs));
		} else {
			gpr.KillImmediate(rt, true, false);
			MOVD_xmm(fpr.RX(fs), gpr.R(rt));
		}
		return;

	case 6: //currentMIPS->WriteFCR(fs, R(rt)); break; //ctc1
		if (fs == 31) {
			// Must clear before setting, since ApplyRoundingMode() assumes it was cleared.
			RestoreRoundingMode();
			if (gpr.IsImm(rt)) {
				gpr.SetImm(MIPS_REG_FPCOND, (gpr.GetImm(rt) >> 23) & 1);
				MOV(32, MIPSSTATE_VAR(fcr31), Imm32(gpr.GetImm(rt) & 0x0181FFFF));
				if ((gpr.GetImm(rt) & 0x1000003) == 0) {
					// Default nearest / no-flush mode, just leave it cleared.
				} else {
					UpdateRoundingMode(gpr.GetImm(rt));
					ApplyRoundingMode();
				}
			} else {
Beispiel #9
0
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(FPU);
	
	int fs = _FS;
	int fd = _FD;

	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);

		// Small optimization: 0 is our default mode anyway.
		if (setMXCSR == 0 && !js.hasSetRounding) {
			setMXCSR = -1;
		}
		if (setMXCSR != -1) {
			STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
			MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
			AND(32, R(TEMPREG), Imm32(~(3 << 13)));
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
			MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
			LDMXCSR(MIPSSTATE_VAR(temp));
		}

		(this->*conv)(TEMPREG, fpr.R(fs));

		// Did we get an indefinite integer value?
		CMP(32, R(TEMPREG), Imm32(0x80000000));
		FixupBranch skip = J_CC(CC_NE);
		if (fd != fs) {
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		XORPS(XMM1, R(XMM1));
		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
		XOR(32, R(TEMPREG), Imm32(0x7fffffff));

		SetJumpTarget(skip);
		MOVD_xmm(fpr.RX(fd), R(TEMPREG));

		if (setMXCSR != -1) {
			LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
		}
	};

	switch (op & 0x3f) {
	case 5:	//F(fd)	= fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssNoSignMask[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			ANDPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			ANDPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 6:	//F(fd)	= F(fs);				break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7:	//F(fd)	= -F(fs);			 break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssSignBits2[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			XORPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			XORPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 4:	//F(fd)	= sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s
		execRounding(&XEmitter::CVTTSS2SI, -1);
		break;

	case 32: //F(fd)	= (float)FsI(fs);			break; //cvt.s.w
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);
		if (fpr.IsMapped(fs)) {
			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
		} else {
			// If fs was fd, we'd be in the case above since we mapped fd.
			MOVSS(fpr.RX(fd), fpr.R(fs));
			CVTDQ2PS(fpr.RX(fd), fpr.R(fd));
		}
		break;

	case 36: //FsI(fd) = (int)	F(fs);			 break; //cvt.w.s
		// Uses the current rounding mode.
		execRounding(&XEmitter::CVTSS2SI, -1);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
		execRounding(&XEmitter::CVTSS2SI, 0);
		break;
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
		execRounding(&XEmitter::CVTSS2SI, 2);
		break;
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
		execRounding(&XEmitter::CVTSS2SI, 1);
		break;
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
Beispiel #10
0
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM.  Separate CLUTs is uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}
void Jit64::stfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU ||
		Core::g_CoreStartupParameter.bTLBHack) {
			mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging)
	{
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3) {
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	} else {
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);
	SetJumpTarget(safe);

	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}