Пример #1
0
//TODO: add optimized cases
void Jit64::ps_maddXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITPairedOff);
	FALLBACK_IF(inst.Rc);

	int a = inst.FA;
	int b = inst.FB;
	int c = inst.FC;
	int d = inst.FD;
	fpr.Lock(a,b,c,d);

	MOVAPD(XMM0, fpr.R(a));
	switch (inst.SUBOP5)
	{
	case 14: //madds0
		MOVDDUP(XMM1, fpr.R(c));
		MULPD(XMM0, R(XMM1));
		ADDPD(XMM0, fpr.R(b));
		break;
	case 15: //madds1
		MOVAPD(XMM1, fpr.R(c));
		SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower
		MULPD(XMM0, R(XMM1));
		ADDPD(XMM0, fpr.R(b));
		break;
	case 28: //msub
		MULPD(XMM0, fpr.R(c));
		SUBPD(XMM0, fpr.R(b));
		break;
	case 29: //madd
		MULPD(XMM0, fpr.R(c));
		ADDPD(XMM0, fpr.R(b));
		break;
	case 30: //nmsub
		MULPD(XMM0, fpr.R(c));
		SUBPD(XMM0, fpr.R(b));
		PXOR(XMM0, M((void*)&psSignBits));
		break;
	case 31: //nmadd
		MULPD(XMM0, fpr.R(c));
		ADDPD(XMM0, fpr.R(b));
		PXOR(XMM0, M((void*)&psSignBits));
		break;
	default:
		_assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!");
		//FallBackToInterpreter(inst);
		//fpr.UnlockAll();
		return;
	}
	fpr.BindToRegister(d, false);
	MOVAPD(fpr.RX(d), Gen::R(XMM0));
	ForceSinglePrecisionP(fpr.RX(d));
	fpr.UnlockAll();
}
void Jit64::fmaddXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	FALLBACK_IF(inst.Rc);

	// Only the interpreter has "proper" support for (some) FP flags
	FALLBACK_IF(inst.SUBOP5 == 29 && Core::g_CoreStartupParameter.bEnableFPRF);

	bool single_precision = inst.OPCD == 59;

	int a = inst.FA;
	int b = inst.FB;
	int c = inst.FC;
	int d = inst.FD;

	fpr.Lock(a, b, c, d);
	MOVSD(XMM0, fpr.R(a));
	switch (inst.SUBOP5)
	{
	case 28: //msub
		MULSD(XMM0, fpr.R(c));
		SUBSD(XMM0, fpr.R(b));
		break;
	case 29: //madd
		MULSD(XMM0, fpr.R(c));
		ADDSD(XMM0, fpr.R(b));
		break;
	case 30: //nmsub
		MULSD(XMM0, fpr.R(c));
		SUBSD(XMM0, fpr.R(b));
		PXOR(XMM0, M((void*)&psSignBits2));
		break;
	case 31: //nmadd
		MULSD(XMM0, fpr.R(c));
		ADDSD(XMM0, fpr.R(b));
		PXOR(XMM0, M((void*)&psSignBits2));
		break;
	}
	fpr.BindToRegister(d, false);
	//YES it is necessary to dupe the result :(
	//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
	if (single_precision) {
		ForceSinglePrecisionS(XMM0);
		MOVDDUP(fpr.RX(d), R(XMM0));
	} else {
		MOVSD(fpr.RX(d), R(XMM0));
	}
	// SMB checks flags after this op. Let's lie.
	//AND(32, M(&PowerPC::ppcState.fpscr), Imm32(~((0x80000000 >> 19) | (0x80000000 >> 15))));
	//OR(32, M(&PowerPC::ppcState.fpscr), Imm32((0x80000000 >> 16)));
	fpr.UnlockAll();
}
Пример #3
0
void Jit64::ps_sign(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITPairedOff);
	FALLBACK_IF(inst.Rc);

	int d = inst.FD;
	int b = inst.FB;

	fpr.Lock(d, b);
	if (d != b)
	{
		fpr.BindToRegister(d, false);
		MOVAPD(fpr.RX(d), fpr.R(b));
	}
	else
	{
		fpr.BindToRegister(d, true);
	}

	switch (inst.SUBOP10)
	{
	case 40: //neg
		PXOR(fpr.RX(d), M((void*)&psSignBits));
		break;
	case 136: //nabs
		POR(fpr.RX(d), M((void*)&psSignBits));
		break;
	case 264: //abs
		PAND(fpr.RX(d), M((void*)&psAbsMask));
		break;
	}

	fpr.UnlockAll();
}
Пример #4
0
void Jit64::ps_sel(UGeckoInstruction inst)
{
	// we can't use (V)BLENDVPD here because it just looks at the sign bit
	// but we need -0 = +0

	INSTRUCTION_START
	JITDISABLE(bJITPairedOff);
	FALLBACK_IF(inst.Rc);

	int d = inst.FD;
	int a = inst.FA;
	int b = inst.FB;
	int c = inst.FC;

	fpr.Lock(a, b, c, d);
	MOVAPD(XMM0, fpr.R(a));
	PXOR(XMM1, R(XMM1));
	// XMM0 = XMM0 < 0 ? all 1s : all 0s
	CMPPD(XMM0, R(XMM1), LT);
	MOVAPD(XMM1, R(XMM0));
	PAND(XMM0, fpr.R(b));
	PANDN(XMM1, fpr.R(c));
	POR(XMM0, R(XMM1));
	fpr.BindToRegister(d, false);
	MOVAPD(fpr.RX(d), R(XMM0));
	fpr.UnlockAll();
}
void Jit64::fsign(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	FALLBACK_IF(inst.Rc);

	int d = inst.FD;
	int b = inst.FB;
	fpr.Lock(b, d);
	fpr.BindToRegister(d, true, true);
	MOVSD(XMM0, fpr.R(b));
	switch (inst.SUBOP10) {
	case 40:  // fnegx
		PXOR(XMM0, M((void*)&psSignBits2));
		break;
	case 264: // fabsx
		PAND(XMM0, M((void*)&psAbsMask2));
		break;
	case 136: // fnabs
		POR(XMM0, M((void*)&psSignBits2));
		break;
	default:
		PanicAlert("fsign bleh");
		break;
	}
	MOVSD(fpr.R(d), XMM0);
	fpr.UnlockAll();
}
Пример #6
0
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
	FALLBACK_IF(true);

	// TODO!
#if 0
	if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
	{
		Default(inst);
		return;
	}
	INSTRUCTION_START;
		MOV(32, R(EAX), gpr.R(inst.RB));
	if (inst.RA)
		ADD(32, R(EAX), gpr.R(inst.RA));
	AND(32, R(EAX), Imm32(~31));
	PXOR(XMM0, R(XMM0));
#if _M_X86_64
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
	AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
	MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
	MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
#endif
}
Пример #7
0
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
	FALLBACK_IF(true);

	// TODO!
#if 0
	if (SConfig::GetInstance().bJITOff || SConfig::GetInstance().bJITLoadStoreOff)
	{
		Default(inst);
		return;
	}
	INSTRUCTION_START;
	MOV(32, R(RSCRATCH), gpr.R(inst.RB));
	if (inst.RA)
		ADD(32, R(RSCRATCH), gpr.R(inst.RA));
	AND(32, R(RSCRATCH), Imm32(~31));
	PXOR(XMM0, R(XMM0));
	MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
	MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
#endif
}
Пример #8
0
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;
	
	int fs = _FS;
	int fd = _FD;

	switch (op & 0x3f) 
	{
	case 5:	//F(fd)	= fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PAND(fpr.RX(fd), M(ssNoSignMask));
		break;

	case 6:	//F(fd)	= F(fs);				break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			MOVSS(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7:	//F(fd)	= -F(fs);			 break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PXOR(fpr.RX(fd), M(ssSignBits2));
		break;


	case 4:	//F(fd)	= sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs); // this probably works, just badly tested
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break;//trunc.w.s
		{
			fpr.SpillLock(fs, fd);
			fpr.StoreFromRegister(fd);
			CVTTSS2SI(EAX, fpr.R(fs));

			// Did we get an indefinite integer value?
			CMP(32, R(EAX), Imm32(0x80000000));
			FixupBranch skip = J_CC(CC_NE);
			MOVSS(XMM0, fpr.R(fs));
			XORPS(XMM1, R(XMM1));
			CMPSS(XMM0, R(XMM1), CMP_LT);

			// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
			// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
			MOVD_xmm(R(EAX), XMM0);
			XOR(32, R(EAX), Imm32(0x7fffffff));

			SetJumpTarget(skip);
			MOV(32, fpr.R(fd), R(EAX));
		}
		break;

	case 32: //F(fd)	= (float)FsI(fs);			break; //cvt.s.w
		// Store to memory so we can read it as an integer value.
		fpr.StoreFromRegister(fs);
		CVTSI2SS(XMM0, fpr.R(fs));
		MOVSS(fpr.R(fd), XMM0);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
	case 36: //FsI(fd) = (int)	F(fs);			 break; //cvt.w.s
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
Пример #9
0
void Jit::Comp_FPU2op(u32 op)
{
	CONDITIONAL_DISABLE;
	
	int fs = _FS;
	int fd = _FD;

	switch (op & 0x3f) 
	{
	case 5:	//F(fd)	= fabsf(F(fs)); break; //abs
		fpr.Lock(fd, fs);
		fpr.BindToRegister(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PAND(fpr.RX(fd), M((void *)ssNoSignMask));
		fpr.UnlockAll();
		break;

	case 6:	//F(fd)	= F(fs);				break; //mov
		if (fd != fs) {
			fpr.Lock(fd, fs);
			fpr.BindToRegister(fd, fd == fs, true);
			MOVSS(fpr.RX(fd), fpr.R(fs));
			fpr.UnlockAll();
		}
		break;

	case 7:	//F(fd)	= -F(fs);			 break; //neg
		fpr.Lock(fd, fs);
		fpr.BindToRegister(fd, fd == fs, true);
		MOVSS(fpr.RX(fd), fpr.R(fs));
		PXOR(fpr.RX(fd), M((void *)ssSignBits2));
		fpr.UnlockAll();
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s

	case 4:	//F(fd)	= sqrtf(F(fs)); break; //sqrt
/*		fpr.Lock(fd, fs); // this probably works, just badly tested
		fpr.BindToRegister(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		fpr.UnlockAll();
		break;*/ 
		Comp_Generic(op);
		return;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break;//trunc.w.s
		fpr.Lock(fs, fd);
		fpr.StoreFromRegister(fd);
		CVTTSS2SI(EAX, fpr.R(fs));
		MOV(32, fpr.R(fd), R(EAX));
		fpr.UnlockAll();
		break;

	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
	case 32: //F(fd)	= (float)FsI(fs);			break; //cvt.s.w
	case 36: //FsI(fd) = (int)	F(fs);			 break; //cvt.w.s
	default:
		Comp_Generic(op);
		return;
	}
}
Пример #10
0
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM.  Separate CLUTs is uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}