Example #1
OpArg FPURegCache::GetDefaultLocation(int reg) const {
	if (reg < 32) {
		return MDisp(CTXREG, reg * 4);
	} else if (reg < 32 + 128) {
		return M(&mips->v[voffset[reg - 32]]);
	} else {
		return M(&tempValues[reg - 32 - 128]);
	}
}
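This accessor maps a flat register index onto three backing stores: indices 0-31 are MIPS FPU registers stored contiguously in the context (hence the simple displacement), 32-159 are VFPU registers remapped through voffset[], and anything above that is a temporary slot. A minimal sketch of the address the first branch describes, assuming ctxBase is the pointer CTXREG holds:

#include <cstdint>
// Illustrative only: the effective address behind MDisp(CTXREG, reg * 4).
uintptr_t FpuSlotAddress(uintptr_t ctxBase, int reg) {
	return ctxBase + reg * 4;	// 4 bytes per single-precision register
}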
Example #2
void Jit::JitSafeMem::DoSlowWrite(void *safeFunc, const OpArg src, int suboffset)
{
	if (iaddr_ != (u32) -1)
		jit_->MOV(32, R(EAX), Imm32(iaddr_ + suboffset));
	else
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));

	jit_->ABI_CallFunctionAA(jit_->thunks.ProtectFunction(safeFunc, 2), src, R(EAX));
	needsCheck_ = true;
}
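At runtime the emitted sequence materializes the guest address in EAX, either as a known immediate plus suboffset or via LEA from the cached address register, then calls the thunk-protected handler with the value and the address. A rough C-level equivalent, assuming the handler takes the source value and the target address in that order:

#include <cstdint>
// Sketch of the call this emits (handler argument order is an assumption).
void SlowWriteEquivalent(void (*safeFunc)(uint32_t value, uint32_t addr),
                         uint32_t value, uint32_t addr, int suboffset) {
	safeFunc(value, addr + suboffset);
}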
Example #3
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset)
{
	if (jit_->gpr.IsImmediate(raddr_))
	{
		u32 addr = jit_->gpr.GetImmediate32(raddr_) + offset_ + suboffset;

#ifdef _M_IX86
		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
		return MDisp(RBX, addr);
#endif
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_ + suboffset);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
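Both #ifdef arms compute the same effective address through different encodings: 32-bit x86 addresses a flat view at Memory::base with the guest address masked into range, while on x86-64 RBX permanently holds the base pointer so MComplex gets the addition for free in the addressing mode. The arithmetic, under that RBX assumption:

#include <cstdint>
// What MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset) dereferences.
uint8_t *FastAddress(uint8_t *memBase, uint32_t xaddr, int offset, int suboffset) {
	return memBase + xaddr + offset + suboffset;
}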
Example #4
OpArg Jit::JitSafeMem::PrepareMemoryOpArg(ReadType type)
{
	// We may not even need to move into EAX as a temporary.
	bool needTemp = alignMask_ != 0xFFFFFFFF;
#ifdef _M_IX86
	// We always mask on 32 bit in fast memory mode.
	needTemp = needTemp || fast_;
#endif

	if (jit_->gpr.R(raddr_).IsSimpleReg() && !needTemp)
	{
		jit_->gpr.MapReg(raddr_, true, false);
		xaddr_ = jit_->gpr.RX(raddr_);
	}
	else
	{
		jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
		xaddr_ = EAX;
	}

	MemCheckAsm(type);

	if (!fast_)
	{
		// Is it in physical ram?
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
		tooLow_ = jit_->J_CC(CC_B);
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_ - (size_ - 1)));
		tooHigh_ = jit_->J_CC(CC_AE);

		// We may need to jump back up here.
		safe_ = jit_->GetCodePtr();
	}
	else
	{
#ifdef _M_IX86
		jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
#endif
	}

	// TODO: This could be more optimal, but the common case is that we want xaddr_ not to include offset_.
	// Since we need to align them after add, we add and subtract.
	if (alignMask_ != 0xFFFFFFFF)
	{
		jit_->ADD(32, R(xaddr_), Imm32(offset_));
		jit_->AND(32, R(xaddr_), Imm32(alignMask_));
		jit_->SUB(32, R(xaddr_), Imm32(offset_));
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
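The ADD/AND/SUB triple does exactly what the comment above promises: the mask must apply to the full guest address (register plus offset_), but callers expect xaddr_ to keep excluding offset_, so the offset is folded in, masked, and folded back out. The same computation in scalar form:

#include <cstdint>
// Equivalent of the emitted ADD/AND/SUB alignment sequence.
uint32_t AlignExcludingOffset(uint32_t xaddr, uint32_t offset, uint32_t alignMask) {
	uint32_t aligned = (xaddr + offset) & alignMask;	// align the real address
	return aligned - offset;	// restore the "xaddr_ excludes offset_" convention
}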
Example #5
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset)
{
	if (jit_->gpr.IsImm(raddr_))
	{
		u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;

#ifdef _M_IX86
		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
		return MDisp(RBX, addr);
#endif
	}

	_dbg_assert_msg_(JIT, (suboffset & alignMask_) == suboffset, "suboffset must be aligned");

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_ + suboffset);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
Example #6
void Jit::JitSafeMem::DoSlowWrite(void *safeFunc, const OpArg src, int suboffset)
{
	if (iaddr_ != (u32) -1)
		jit_->MOV(32, R(EAX), Imm32((iaddr_ + suboffset) & alignMask_));
	else
	{
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));
		if (alignMask_ != 0xFFFFFFFF)
			jit_->AND(32, R(EAX), Imm32(alignMask_));
	}

	jit_->CallProtectedFunction(safeFunc, src, R(EAX));
	needsCheck_ = true;
}
Example #7
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
	FALLBACK_IF(true);

	// TODO!
#if 0
	if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
		{Default(inst); return;} // turn off from debugger
	INSTRUCTION_START;
	MOV(32, R(EAX), gpr.R(inst.RB));
	if (inst.RA)
		ADD(32, R(EAX), gpr.R(inst.RA));
	AND(32, R(EAX), Imm32(~31));
	PXOR(XMM0, R(XMM0));
#if _M_X86_64
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
	AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
	MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
	MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
#endif
}
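The disabled body is a classic fast path: compute the effective address from RB (plus RA if present), clear the low five bits to land on a 32-byte cache line, and store two zeroed 16-byte XMM values. Its scalar meaning, with base standing in for the host view of guest memory:

#include <cstdint>
#include <cstring>
// C-level equivalent of the #if 0 fast path: zero one 32-byte line.
void DcbzEquivalent(uint8_t *base, uint32_t effectiveAddr) {
	std::memset(base + (effectiveAddr & ~31u), 0, 32);
}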
Example #8
OpArg Jit::JitSafeMem::PrepareMemoryOpArg()
{
	// We may not even need to move into EAX as a temporary.
	// TODO: Except on x86 in fastmem mode.
	if (jit_->gpr.R(raddr_).IsSimpleReg())
	{
		jit_->gpr.BindToRegister(raddr_, true, false);
		xaddr_ = jit_->gpr.RX(raddr_);
	}
	else
	{
		jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
		xaddr_ = EAX;
	}

	if (!g_Config.bFastMemory)
	{
		// Is it in physical ram?
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
		tooLow_ = jit_->J_CC(CC_L);
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_));
		tooHigh_ = jit_->J_CC(CC_GE);

		// We may need to jump back up here.
		safe_ = jit_->GetCodePtr();
	}
	else
	{
#ifdef _M_IX86
		// Need to modify it, too bad.
		if (xaddr_ != EAX)
			jit_->MOV(32, R(EAX), R(xaddr_));
		jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
		xaddr_ = EAX;
#endif
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
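Note that this older revision guards the range with signed jumps (CC_L/CC_GE), while the later revision in Example #4 uses unsigned CC_B/CC_AE and also accounts for size_, which is the safer reading of a 32-bit address comparison. The bounds test the two CMP/Jcc pairs implement, written out:

#include <cstdint>
// The check emitted above: is xaddr_ + offset_ inside [kernelBase, userEnd)?
bool InCheckedRange(uint32_t xaddr, uint32_t offset,
                    uint32_t kernelBase, uint32_t userEnd) {
	return xaddr >= kernelBase - offset && xaddr < userEnd - offset;
}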
Example #9
void Jit::JitSafeMem::NextSlowRead(void *safeFunc, int suboffset)
{
	_dbg_assert_msg_(JIT, !g_Config.bFastMemory, "NextSlowRead() called in fast memory mode?");

	// For simplicity, do nothing for 0.  We already read in PrepareSlowRead().
	if (suboffset == 0)
		return;

	if (jit_->gpr.IsImmediate(raddr_))
	{
		_dbg_assert_msg_(JIT, !Memory::IsValidAddress(iaddr_), "NextSlowRead() for a valid immediate address?");

		jit_->MOV(32, R(EAX), Imm32(iaddr_ + suboffset));
	}
	// For GPR, if xaddr_ was the dest register, this will be wrong.  Don't use in GPR.
	else
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));

	jit_->ABI_CallFunctionA(jit_->thunks.ProtectFunction(safeFunc, 1), R(EAX));
}
Example #10
bool Jit::JitSafeMem::PrepareRead(OpArg &src)
{
	if (iaddr_ != (u32) -1)
	{
		if (Memory::IsValidAddress(iaddr_))
		{
#ifdef _M_IX86
			src = M(Memory::base + (iaddr_ & Memory::MEMVIEW32_MASK));
#else
			src = MDisp(RBX, iaddr_);
#endif
			return true;
		}
		else
			return false;
	}
	else
		src = PrepareMemoryOpArg();
	return true;
}
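A false return tells the caller the address is an immediate already known to be invalid, so only the slow path is worth emitting; true means src is ready to use. The predicate reduced to plain logic, with isValidAddress standing in for Memory::IsValidAddress:

#include <cstdint>
// Sketch of the fast-path decision PrepareRead() makes for immediates.
bool CanUseFastRead(bool hasImmediate, uint32_t iaddr,
                    bool (*isValidAddress)(uint32_t)) {
	return !hasImmediate || isValidAddress(iaddr);	// non-immediates always get an OpArg
}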
Example #11
bool Jit::JitSafeMem::PrepareWrite(OpArg &dest)
{
	// If it's an immediate, we can do the write if valid.
	if (iaddr_ != (u32) -1)
	{
		if (Memory::IsValidAddress(iaddr_))
		{
#ifdef _M_IX86
			dest = M(Memory::base + (iaddr_ & Memory::MEMVIEW32_MASK));
#else
			dest = MDisp(RBX, iaddr_);
#endif
			return true;
		}
		else
			return false;
	}
	// Otherwise, we can always do the write (conditionally).
	else
		dest = PrepareMemoryOpArg();
	return true;
}
Example #12
bool Jit::JitSafeMem::PrepareRead(OpArg &src, int size)
{
	size_ = size;
	if (iaddr_ != (u32) -1)
	{
		if (ImmValid())
		{
			MemCheckImm(MEM_READ);

#ifdef _M_IX86
			src = M(Memory::base + (iaddr_ & Memory::MEMVIEW32_MASK & alignMask_));
#else
			src = MDisp(RBX, iaddr_ & alignMask_);
#endif
			return true;
		}
		else
			return false;
	}
	else
		src = PrepareMemoryOpArg(MEM_READ);
	return true;
}
Example #13
bool Jit::JitSafeMem::PrepareSlowRead(void *safeFunc)
{
	if (!g_Config.bFastMemory)
	{
		if (iaddr_ != (u32) -1)
		{
			// No slow read necessary.
			if (Memory::IsValidAddress(iaddr_))
				return false;
			jit_->MOV(32, R(EAX), Imm32(iaddr_));
		}
		else
		{
			PrepareSlowAccess();
			jit_->LEA(32, EAX, MDisp(xaddr_, offset_));
		}

		jit_->ABI_CallFunctionA(jit_->thunks.ProtectFunction(safeFunc, 1), R(EAX));
		needsCheck_ = true;
		return true;
	}
	else
		return false;
}
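The emitted slow read boils down to a single thunk-protected call with the guest address in EAX, and the handler's return value also comes back in EAX, which is why callers pick the result up from there. Its runtime shape, assuming the handler signature:

#include <cstdint>
// What the ABI_CallFunctionA(...) line performs at runtime.
uint32_t SlowReadEquivalent(uint32_t (*safeFunc)(uint32_t addr), uint32_t addr) {
	return safeFunc(addr);	// result arrives in EAX
}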
Example #14
void Jit::JitSafeMem::NextSlowRead(void *safeFunc, int suboffset)
{
	_dbg_assert_msg_(JIT, !fast_, "NextSlowRead() called in fast memory mode?");

	// For simplicity, do nothing for 0.  We already read in PrepareSlowRead().
	if (suboffset == 0)
		return;

	if (jit_->gpr.IsImm(raddr_))
	{
		_dbg_assert_msg_(JIT, !Memory::IsValidAddress(iaddr_ + suboffset), "NextSlowRead() for a valid immediate address?");

		jit_->MOV(32, R(EAX), Imm32((iaddr_ + suboffset) & alignMask_));
	}
	// For GPR, if xaddr_ was the dest register, this will be wrong.  Don't use in GPR.
	else
	{
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));
		if (alignMask_ != 0xFFFFFFFF)
			jit_->AND(32, R(EAX), Imm32(alignMask_));
	}

	jit_->CallProtectedFunction(safeFunc, R(EAX));
}
Example #15
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level alone; we just always load from RAM. Separate CLUTs are uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}
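Numerically, the SSE ladder above is a bilinear blend at 8.8 fixed point: the four texels are widened to float, the left/right pairs are mixed by frac_u / 256, and the two rows are then mixed by frac_v / 256 before packing back to 8-bit channels. Per-channel scalar form, assuming by256 and ones hold 1/256 and 1.0 splatted across a vector:

// Equivalent of one MULPS/SUBPS/ADDPS blend step; frac is in [0, 256).
float Lerp256(float a, float b, float frac) {
	float f = frac * (1.0f / 256.0f);
	return a * (1.0f - f) + b * f;
}
// result = Lerp256(Lerp256(TL, TR, frac_u), Lerp256(BL, BR, frac_u), frac_v)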
void Jit64::stfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU ||
		Core::g_CoreStartupParameter.bTLBHack) {
			mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging)
	{
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3) {
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	} else {
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);
	SetJumpTarget(safe);

	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}
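Both stfd paths store the double big-endian: the SSSE3 path byte-swaps all eight bytes at once with PSHUFB, while the fallback writes two byte-swapped 32-bit halves, low dword to addr+4 and high dword to addr+0. The memory layout both produce, spelled out:

#include <cstdint>
// Most significant byte of the 64-bit double lands at the lowest address.
void StoreDoubleBE(uint8_t *p, uint64_t bits) {
	for (int i = 0; i < 8; i++)
		p[i] = uint8_t(bits >> (56 - 8 * i));
}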
void Jit64::lfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int d = inst.RD;
	int a = inst.RA;

	s32 offset = (s32)(s16)inst.SIMM_16;
	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	MOV(32, R(ABI_PARAM1), gpr.R(a));
	// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
	fpr.Lock(d);
	fpr.BindToRegister(d, true);
	X64Reg xd = fpr.RX(d);

	if (cpu_info.bSSSE3)
	{
#if _M_X86_64
		MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
		PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
		MOVSD(xd, R(XMM0));
	} else {
#if _M_X86_64
		LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
		MOV(64, M(&temp64), R(EAX));

		MEMCHECK_START

		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		BSWAP(32, EAX);
		MOV(32, M((void*)((u8 *)&temp64+4)), R(EAX));

		MEMCHECK_START

		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
		BSWAP(32, EAX);
		MOV(32, M(&temp64), R(EAX));
		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#endif
	}

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}
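The final MOVSD(xd, R(XMM0)) is what honors the TODO comment about the upper double: a register-to-register MOVSD replaces only the low 64-bit lane and preserves the destination's upper lane. In scalar terms:

// Merge semantics of MOVSD xd, xmm0 between registers.
struct DoublePair { double lo, hi; };
DoublePair MovsdMerge(DoublePair xd, double loaded) {
	xd.lo = loaded;	// only the low double changes; hi is preserved
	return xd;
}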