Example #1
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
	FALLBACK_IF(true);

	// TODO!
#if 0
	if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
	{
		Default(inst);
		return;
	}
	INSTRUCTION_START;
	MOV(32, R(EAX), gpr.R(inst.RB));
	if (inst.RA)
		ADD(32, R(EAX), gpr.R(inst.RA));
	AND(32, R(EAX), Imm32(~31));
	PXOR(XMM0, R(XMM0));
#if _M_X86_64
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
	AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
	MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
	MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
#endif
}
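For orientation, a plain-C++ sketch of what the disabled block above would emit once re-enabled (hypothetical helper; memBase stands in for the EBX/RBX memory base the emitted code uses): dcbz zeroes the 32-byte cache line containing RB plus RA (RA treated as 0 when the field is 0).

#include <cstdint>
#include <cstring>

// Sketch only: scalar equivalent of the #if 0 fast path above.
static void DcbzScalar(uint8_t *memBase, uint32_t ra_val, uint32_t rb_val)
{
	uint32_t addr = (ra_val + rb_val) & ~31u; // align down to the cache line
	std::memset(memBase + addr, 0, 32);       // zero all 32 bytes of the line
}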
Example #2
bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
	if (id.swizzle) {
		return Jit_GetTexDataSwizzled(id, bitsPerTexel);
	}

	// srcReg might be EDX, so let's copy that before we multiply.
	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		LEA(64, tempReg1, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		XOR(32, R(tempReg2), R(tempReg2));
		SHR(32, R(uReg), Imm8(1));
		FixupBranch skip = J_CC(CC_NC);
		// Track whether we shifted a 1 off or not.
		MOV(32, R(tempReg2), Imm32(4));
		SetJumpTarget(skip);
		LEA(64, tempReg1, MRegSum(srcReg, uReg));
		break;
	}

	default:
		return false;
	}

	MOV(32, R(EAX), R(vReg));
	MUL(32, R(bufwReg));

	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		MOVZX(32, bitsPerTexel, resultReg, MComplex(tempReg1, RAX, bitsPerTexel / 8, 0));
		break;

	case 4: {
		SHR(32, R(RAX), Imm8(1));
		MOV(8, R(resultReg), MRegSum(tempReg1, RAX));
		// RCX is now free.
		MOV(8, R(RCX), R(tempReg2));
		SHR(8, R(resultReg), R(RCX));
		// Zero out any bits not shifted off.
		AND(32, R(resultReg), Imm8(0x0F));
		break;
	}

	default:
		return false;
	}

	return true;
}
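The 4bpp case above packs two texels per byte; the JIT tracks the bit shifted off uReg in the carry flag to pick a nibble. A scalar restatement (hypothetical helper, assuming the linear, non-swizzled layout this path handles):

#include <cstdint>

static uint8_t GetTexel4bpp(const uint8_t *src, uint32_t u, uint32_t v, uint32_t bufw)
{
	// Matches the emitted code: ((v * bufw) >> 1) + (u >> 1) selects the byte...
	uint32_t byteIndex = ((v * bufw) >> 1) + (u >> 1);
	uint8_t b = src[byteIndex];
	// ...and the low bit of u selects the nibble (shift by 4 if a 1 was shifted off).
	return (b >> ((u & 1) ? 4 : 0)) & 0x0F;
}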
Example #3
void Jit::Comp_FPULS(u32 op)
{
	CONDITIONAL_DISABLE;
	if (!g_Config.bFastMemory) {
		DISABLE;
	}

	s32 offset = (s16)(op&0xFFFF);
	int ft = ((op>>16)&0x1f);
	int rs = _RS;
	// u32 addr = R(rs) + offset;

	switch(op >> 26)
	{
	case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1
		gpr.Lock(rs);
		fpr.Lock(ft);
		fpr.BindToRegister(ft, false, true);
#ifdef _M_IX86
		MOV(32, R(EAX), gpr.R(rs));
		AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
		MOVSS(fpr.RX(ft), MDisp(EAX, (u32)Memory::base + offset));
#else
		MOV(32, R(EAX), gpr.R(rs));
		MOVSS(fpr.RX(ft), MComplex(RBX, RAX, SCALE_1, offset));
#endif
		gpr.UnlockAll();
		fpr.UnlockAll();
		break;
	case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1
		gpr.Lock(rs);
		fpr.Lock(ft);
		fpr.BindToRegister(ft, true, false);
#ifdef _M_IX86
		MOV(32, R(EAX), gpr.R(rs));
		AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
		MOVSS(MDisp(EAX, (u32)Memory::base + offset), fpr.RX(ft));
#else
		MOV(32, R(EAX), gpr.R(rs));
		MOVSS(MComplex(RBX, RAX, SCALE_1, offset), fpr.RX(ft));
#endif
		gpr.UnlockAll();
		fpr.UnlockAll();
		break;

	default:
		_dbg_assert_msg_(CPU,0,"Trying to interpret FPULS instruction that can't be interpreted");
		break;
	}
}
Example #4
bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
	if (!id.useSharedClut) {
		// TODO: Need to load from RAM, always.
		if (id.linear) {
#ifdef _WIN32
			const int argOffset = 24 + 48 + 8 + 32;
			// Extra 8 to account for CALL.
			MOV(32, R(tempReg2), MDisp(RSP, argOffset + 16 + 8));
#else
			// Extra 8 to account for CALL.
			MOV(32, R(tempReg2), MDisp(RSP, 24 + 48 + 8 + 8));
#endif
			LEA(32, tempReg2, MScaled(tempReg2, SCALE_4, 0));
		} else {
#ifdef _WIN32
			// The argument was saved on the stack.
			MOV(32, R(tempReg2), MDisp(RSP, 40));
			LEA(32, tempReg2, MScaled(tempReg2, SCALE_4, 0));
#else
			// We need to multiply by 16 and add, LEA allows us to copy too.
			LEA(32, tempReg2, MScaled(levelReg, SCALE_4, 0));
#endif
		}

	// Second step of the multiply by 16 (since we only multiplied by 4 before).
		LEA(64, resultReg, MComplex(resultReg, tempReg2, SCALE_4, 0));
	}

	MOV(PTRBITS, R(tempReg1), ImmPtr(clut));

	switch ((GEPaletteFormat)id.clutfmt) {
	case GE_CMODE_16BIT_BGR5650:
		MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
		return Jit_Decode5650();

	case GE_CMODE_16BIT_ABGR5551:
		MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
		return Jit_Decode5551();

	case GE_CMODE_16BIT_ABGR4444:
		MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
		return Jit_Decode4444();

	case GE_CMODE_32BIT_ABGR8888:
		MOV(32, R(resultReg), MComplex(tempReg1, resultReg, SCALE_4, 0));
		return true;

	default:
		return false;
	}
}
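The "second step of the multiply by 16" comment is the key trick here: LEA can scale by at most 8, so the x16 is split into two x4 LEAs, the second of which also folds in the add. In scalar form (sketch):

// tempReg2 = index * 4;                    // first LEA, SCALE_4
// resultReg = resultReg + tempReg2 * 4;    // second LEA: net resultReg + index * 16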
Example #5
std::shared_ptr<FunctionTree>
CoherentIntensity::createFunctionTree(const ParameterList &DataSample,
                                      const std::string &suffix) const {

  size_t n = DataSample.mDoubleValue(0)->values().size();

  auto NodeName = "CoherentIntensity(" + Name + ")" + suffix;

  auto tr = std::make_shared<FunctionTree>(
      NodeName, MDouble("", n), std::make_shared<AbsSquare>(ParType::MDOUBLE));

  tr->createNode("SumOfAmplitudes", MComplex("", n),
                 std::make_shared<AddAll>(ParType::MCOMPLEX), NodeName);

  for (auto i : Amplitudes) {
    std::shared_ptr<ComPWA::FunctionTree> resTree =
        i->createFunctionTree(DataSample, suffix);
    if (!resTree->sanityCheck())
      throw std::runtime_error("CoherentIntensity::createFunctionTree(): tree "
                               "didn't pass sanity check!");
    resTree->parameter();
    tr->insertTree(resTree, "SumOfAmplitudes");
  }

  return tr;
}
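Reading the strategies off the tree, the root applies AbsSquare to an AddAll over the amplitude subtrees, so per event this evaluates (a sketch of the intended math, inferred from the node types rather than ComPWA documentation):

I(x) = |Σ_i A_i(x)|²

where the A_i are the entries of Amplitudes inserted under the SumOfAmplitudes node.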
Example #6
bool SamplerJitCache::Jit_GetTexDataSwizzled4() {
	// Get the horizontal tile pos into tempReg1.
	LEA(32, tempReg1, MScaled(uReg, SCALE_4, 0));
	// Note: imm8 sign extends negative.
	AND(32, R(tempReg1), Imm8(~127));

	// Add vertical offset inside tile to tempReg1.
	LEA(32, tempReg2, MScaled(vReg, SCALE_4, 0));
	AND(32, R(tempReg2), Imm8(31));
	LEA(32, tempReg1, MComplex(tempReg1, tempReg2, SCALE_4, 0));
	// Add srcReg, since we'll need it at some point.
	ADD(64, R(tempReg1), R(srcReg));

	// Now find the vertical tile pos, and add to tempReg1.
	SHR(32, R(vReg), Imm8(3));
	LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
	MUL(32, R(vReg));
	ADD(64, R(tempReg1), R(EAX));

	// Last, and possibly also least, the horizontal offset inside the tile.
	AND(32, R(uReg), Imm8(31));
	SHR(32, R(uReg), Imm8(1));
	MOV(8, R(resultReg), MRegSum(tempReg1, uReg));
	FixupBranch skipNonZero = J_CC(CC_NC);
	// If the horizontal offset was odd, take the upper 4.
	SHR(8, R(resultReg), Imm8(4));
	SetJumpTarget(skipNonZero);
	// Zero out the rest of the bits.
	AND(32, R(resultReg), Imm8(0x0F));

	return true;
}
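A scalar restatement of the routine above (hypothetical helper): at 4bpp a PSP swizzle tile is 16 bytes (32 texels) wide and 8 rows tall, i.e. 128 bytes, and a full tile row spans bufw/2 bytes x 8 rows = bufw*4 bytes.

#include <cstdint>

static uint8_t GetTexel4bppSwizzled(const uint8_t *src, uint32_t u, uint32_t v, uint32_t bufw)
{
	uint32_t byteIndex = (u & ~31u) * 4        // horizontal tile: 128 bytes each
	                   + (v & 7u) * 16         // row inside the tile: 16 bytes each
	                   + (v >> 3) * bufw * 4   // vertical tile row: bufw * 4 bytes each
	                   + ((u & 31u) >> 1);     // byte inside the row
	uint8_t b = src[byteIndex];
	return (u & 1) ? (b >> 4) : (b & 0x0F);    // odd u takes the upper nibble
}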
Example #7
OpArg Jit::JitSafeMem::PrepareMemoryOpArg(ReadType type)
{
	// We may not even need to move into EAX as a temporary.
	bool needTemp = alignMask_ != 0xFFFFFFFF;
#ifdef _M_IX86
	// We always mask on 32 bit in fast memory mode.
	needTemp = needTemp || fast_;
#endif

	if (jit_->gpr.R(raddr_).IsSimpleReg() && !needTemp)
	{
		jit_->gpr.MapReg(raddr_, true, false);
		xaddr_ = jit_->gpr.RX(raddr_);
	}
	else
	{
		jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
		xaddr_ = EAX;
	}

	MemCheckAsm(type);

	if (!fast_)
	{
		// Is it in physical ram?
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
		tooLow_ = jit_->J_CC(CC_B);
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_ - (size_ - 1)));
		tooHigh_ = jit_->J_CC(CC_AE);

		// We may need to jump back up here.
		safe_ = jit_->GetCodePtr();
	}
	else
	{
#ifdef _M_IX86
		jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
#endif
	}

	// TODO: This could be more optimal, but the common case is that we want xaddr_ not to include offset_.
	// Since we need to align them after add, we add and subtract.
	if (alignMask_ != 0xFFFFFFFF)
	{
		jit_->ADD(32, R(xaddr_), Imm32(offset_));
		jit_->AND(32, R(xaddr_), Imm32(alignMask_));
		jit_->SUB(32, R(xaddr_), Imm32(offset_));
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
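The add/mask/subtract sequence near the end deserves a second look: alignment must be applied to the full address, but the caller wants xaddr_ to exclude offset_ so the returned OpArg can add it back. In scalar form (sketch):

static uint32_t AlignExcludingOffset(uint32_t xaddr, uint32_t offset, uint32_t alignMask)
{
	return ((xaddr + offset) & alignMask) - offset;
}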
Example #8
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
	FALLBACK_IF(true);

	// TODO!
#if 0
	if (SConfig::GetInstance().bJITOff || SConfig::GetInstance().bJITLoadStoreOff)
	{
		Default(inst);
		return;
	}
	INSTRUCTION_START;
	MOV(32, R(RSCRATCH), gpr.R(inst.RB));
	if (inst.RA)
		ADD(32, R(RSCRATCH), gpr.R(inst.RA));
	AND(32, R(RSCRATCH), Imm32(~31));
	PXOR(XMM0, R(XMM0));
	MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
	MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
#endif
}
Example #9
OpArg Jit::JitSafeMem::PrepareMemoryOpArg()
{
	// We may not even need to move into EAX as a temporary.
	// TODO: Except on x86 in fastmem mode.
	if (jit_->gpr.R(raddr_).IsSimpleReg())
	{
		jit_->gpr.BindToRegister(raddr_, true, false);
		xaddr_ = jit_->gpr.RX(raddr_);
	}
	else
	{
		jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
		xaddr_ = EAX;
	}

	if (!g_Config.bFastMemory)
	{
		// Is it in physical ram?
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
		tooLow_ = jit_->J_CC(CC_L);
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_));
		tooHigh_ = jit_->J_CC(CC_GE);

		// We may need to jump back up here.
		safe_ = jit_->GetCodePtr();
	}
	else
	{
#ifdef _M_IX86
		// Need to modify it, too bad.
		if (xaddr_ != EAX)
			jit_->MOV(32, R(EAX), R(xaddr_));
		jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
		xaddr_ = EAX;
#endif
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
Example #10
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset)
{
	if (jit_->gpr.IsImmediate(raddr_))
	{
		u32 addr = jit_->gpr.GetImmediate32(raddr_) + offset_ + suboffset;

#ifdef _M_IX86
		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
		return MDisp(RBX, addr);
#endif
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_ + suboffset);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
Example #11
std::shared_ptr<ComPWA::FunctionTree>
FormFactorDecorator::createFunctionTree(const ParameterList &DataSample,
                                        unsigned int pos,
                                         const std::string &suffix) const {

  // size_t sampleSize = DataSample.mDoubleValue(pos)->values().size();
  size_t sampleSize = DataSample.mDoubleValue(0)->values().size();

  std::string NodeName =
      "BreitWignerWithProductionFormFactor(" + Name + ")" + suffix;

  auto tr = std::make_shared<FunctionTree>(NodeName, MComplex("", sampleSize),
      std::make_shared<MultAll>(ParType::MCOMPLEX));

  std::string ffNodeName = "ProductionFormFactor(" + Name + ")" + suffix;
  auto ffTree = std::make_shared<FunctionTree>(ffNodeName,
      MDouble("", sampleSize), std::make_shared<FormFactorStrategy>());
  // Add L and FFType as double-valued leaves, since there is no int leaf
  ffTree->createLeaf("OrbitalAngularMomentum", (double)L, ffNodeName);
  ffTree->createLeaf("MesonRadius", MesonRadius, ffNodeName);
  ffTree->createLeaf("FormFactorType", (double)FFType, ffNodeName);
  ffTree->createLeaf("MassA", Daughter1Mass, ffNodeName);
  ffTree->createLeaf("MassB", Daughter2Mass, ffNodeName);
  ffTree->createLeaf("Data_mSq[" + std::to_string(pos) + "]",
                     DataSample.mDoubleValue(pos), ffNodeName);
  ffTree->parameter();

  tr->insertTree(ffTree, NodeName);

  std::shared_ptr<ComPWA::FunctionTree> breitWignerTree =
      UndecoratedBreitWigner->createFunctionTree(DataSample, pos, suffix);
  breitWignerTree->parameter();

  tr->insertTree(breitWignerTree, NodeName);

  if (!tr->sanityCheck())
    throw std::runtime_error(
        "FormFactorDecorator::createFunctionTree() | "
        "Tree didn't pass sanity check!");

  return tr;
}
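Since the root strategy is MultAll over the two inserted subtrees, the decorated amplitude this tree evaluates is, per event (a sketch inferred from the node names, not from ComPWA documentation):

A(m²) = FF_prod(m²; L, MesonRadius, MassA, MassB) × BW(m²)

with the Breit-Wigner factor supplied by the UndecoratedBreitWigner subtree.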
Example #12
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset)
{
	if (jit_->gpr.IsImm(raddr_))
	{
		u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;

#ifdef _M_IX86
		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
		return MDisp(RBX, addr);
#endif
	}

	_dbg_assert_msg_(JIT, (suboffset & alignMask_) == suboffset, "suboffset must be aligned");

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_ + suboffset);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
Example #13
TEST_F(x64EmitterTest, PUSH_MComplex)
{
	emitter->PUSH(64, MComplex(RAX, RBX, SCALE_2, 4));
	ExpectDisassembly("push qword ptr ds:[rax+rbx*2+4]");
}
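This test pins down the encoding: MComplex(base, index, scale, disp) emits [base + index*scale + disp]. Companion sketches for the simpler helpers used throughout these examples would follow the same pattern; the expected disassembly strings below are assumptions about the disassembler's output, not verified against it:

// Sketch only: same fixture as the MComplex test above.
TEST_F(x64EmitterTest, PUSH_MDisp)
{
	// MDisp(base, disp) -> [base + disp]
	emitter->PUSH(64, MDisp(RAX, 4));
	ExpectDisassembly("push qword ptr ds:[rax+4]");
}

TEST_F(x64EmitterTest, PUSH_MScaled)
{
	// MScaled(index, scale, disp) -> [index*scale + disp], no base register
	emitter->PUSH(64, MScaled(RBX, SCALE_4, 16));
	ExpectDisassembly("push qword ptr ds:[rbx*4+16]");
}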
Example #14
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4();
	}

	LEA(32, tempReg1, MScaled(vReg, SCALE_4, 0));
	AND(32, R(tempReg1), Imm8(31));
	AND(32, R(vReg), Imm8(~7));

	MOV(32, R(tempReg2), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(tempReg2), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(tempReg2), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		return false;
	}
	AND(32, R(tempReg2), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(tempReg1), R(tempReg2));
	ADD(32, R(tempReg1), R(resultReg));

	// We may clobber srcReg in the MUL, so let's grab it now.
	LEA(64, tempReg1, MComplex(srcReg, tempReg1, SCALE_4, 0));

	LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
	MUL(32, R(vReg));

	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(tempReg1, EAX));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		// Multiply by two by just adding twice.
		ADD(32, R(EAX), R(uReg));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	default:
		return false;
	}

	return true;
}
Example #15
void Jit64::lfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int d = inst.RD;
	int a = inst.RA;

	s32 offset = (s32)(s16)inst.SIMM_16;
	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	MOV(32, R(ABI_PARAM1), gpr.R(a));
	// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
	fpr.Lock(d);
	fpr.BindToRegister(d, true);
	X64Reg xd = fpr.RX(d);

	if (cpu_info.bSSSE3)
	{
#if _M_X86_64
		MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
		PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
		MOVSD(xd, R(XMM0));
	} else {
#if _M_X86_64
		LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
		MOV(64, M(&temp64), R(EAX));

		MEMCHECK_START

		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		BSWAP(32, EAX);
		MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));

		MEMCHECK_START

		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
		BSWAP(32, EAX);
		MOV(32, M(&temp64), R(EAX));
		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#endif
	}

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}
Example #16
void Jit64::stfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU ||
		Core::g_CoreStartupParameter.bTLBHack) {
		mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging)
	{
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3) {
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	} else {
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);
	SetJumpTarget(safe);

	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}