예제 #1
0
파일: CompFPU.cpp 프로젝트: AmesianX/ppsspp
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE(FPU);
	
	int fs = _FS;
	int fd = _FD;

	auto execRounding = [&](void (XEmitter::*conv)(X64Reg, OpArg), int setMXCSR) {
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);

		// Small optimization: 0 is our default mode anyway.
		if (setMXCSR == 0 && !js.hasSetRounding) {
			setMXCSR = -1;
		}
		if (setMXCSR != -1) {
			STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
			MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
			AND(32, R(TEMPREG), Imm32(~(3 << 13)));
			OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
			MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
			LDMXCSR(MIPSSTATE_VAR(temp));
		}

		(this->*conv)(TEMPREG, fpr.R(fs));

		// Did we get an indefinite integer value?
		CMP(32, R(TEMPREG), Imm32(0x80000000));
		FixupBranch skip = J_CC(CC_NE);
		if (fd != fs) {
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		XORPS(XMM1, R(XMM1));
		CMPSS(fpr.RX(fd), R(XMM1), CMP_LT);

		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(TEMPREG), fpr.RX(fd));
		XOR(32, R(TEMPREG), Imm32(0x7fffffff));

		SetJumpTarget(skip);
		MOVD_xmm(fpr.RX(fd), R(TEMPREG));

		if (setMXCSR != -1) {
			LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
		}
	};

	switch (op & 0x3f) {
	case 5:	//F(fd)	= fabsf(F(fs)); break; //abs
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssNoSignMask[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			ANDPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			ANDPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 6:	//F(fd)	= F(fs);				break; //mov
		if (fd != fs) {
			fpr.SpillLock(fd, fs);
			fpr.MapReg(fd, fd == fs, true);
			CopyFPReg(fpr.RX(fd), fpr.R(fs));
		}
		break;

	case 7:	//F(fd)	= -F(fs);			 break; //neg
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		MOV(PTRBITS, R(TEMPREG), ImmPtr(&ssSignBits2[0]));
		if (fd != fs && fpr.IsMapped(fs)) {
			MOVAPS(fpr.RX(fd), MatR(TEMPREG));
			XORPS(fpr.RX(fd), fpr.R(fs));
		} else {
			if (fd != fs) {
				MOVSS(fpr.RX(fd), fpr.R(fs));
			}
			XORPS(fpr.RX(fd), MatR(TEMPREG));
		}
		break;

	case 4:	//F(fd)	= sqrtf(F(fs)); break; //sqrt
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fd == fs, true);
		SQRTSS(fpr.RX(fd), fpr.R(fs));
		break;

	case 13: //FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)); break; //trunc.w.s
		execRounding(&XEmitter::CVTTSS2SI, -1);
		break;

	case 32: //F(fd)	= (float)FsI(fs);			break; //cvt.s.w
		fpr.SpillLock(fd, fs);
		fpr.MapReg(fd, fs == fd, true);
		if (fpr.IsMapped(fs)) {
			CVTDQ2PS(fpr.RX(fd), fpr.R(fs));
		} else {
			// If fs was fd, we'd be in the case above since we mapped fd.
			MOVSS(fpr.RX(fd), fpr.R(fs));
			CVTDQ2PS(fpr.RX(fd), fpr.R(fd));
		}
		break;

	case 36: //FsI(fd) = (int)	F(fs);			 break; //cvt.w.s
		// Uses the current rounding mode.
		execRounding(&XEmitter::CVTSS2SI, -1);
		break;

	case 12: //FsI(fd) = (int)floorf(F(fs)+0.5f); break; //round.w.s
		execRounding(&XEmitter::CVTSS2SI, 0);
		break;
	case 14: //FsI(fd) = (int)ceilf (F(fs)); break; //ceil.w.s
		execRounding(&XEmitter::CVTSS2SI, 2);
		break;
	case 15: //FsI(fd) = (int)floorf(F(fs)); break; //floor.w.s
		execRounding(&XEmitter::CVTSS2SI, 1);
		break;
	default:
		DISABLE;
		return;
	}
	fpr.ReleaseSpillLocks();
}
예제 #2
0
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM.  Separate CLUTs is uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}