Example #1
PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
	INT32 srcStep,			/* bytes between rows in source data */
	BYTE *pDst,				/* 32-bit interleaved ARGB data (bytes B,G,R,0xFF) */
	INT32 dstStep,			/* bytes between rows in dest data */
	const prim_size_t *roi)	/* region of interest */
{
	const UINT16 *r = (const UINT16 *) (pSrc[0]);
	const UINT16 *g = (const UINT16 *) (pSrc[1]);
	const UINT16 *b = (const UINT16 *) (pSrc[2]);
	BYTE *out;
	int srcbump, dstbump, y;

	/* Ensure 16-byte alignment on all pointers,
	 * that width is a multiple of 16,
	 * and that the next row will also remain aligned.
	 * Since this is usually used for 64x64 aligned arrays,
	 * these checks should presumably pass.
	 */
	if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
			|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
			|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
			|| (((ULONG_PTR) pDst & 0x0f) != 0)
			|| (roi->width & 0x0f)
			|| (srcStep & 0x0f)
			|| (dstStep & 0x0f))
	{
		return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
	}

	out = (BYTE *) pDst;
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y=0; y<roi->height; ++y)
	{
		int width = roi->width;
		do {
			__m128i R0, R1, R2, R3, R4;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			R0 = LOAD128(b);  b += 8;		/* R0 = 00B300B200B100B0 */
			R1 = LOAD128(b);  b += 8;		/* R1 = 00B700B600B500B4 */
			PACKUSWB(R0,R1);				/* R0 = B7B6B5B4B3B2B1B0 */
			R1 = LOAD128(g);  g += 8;		/* R1 = 00G300G200G100G0 */
			R2 = LOAD128(g);  g += 8;		/* R2 = 00G700G600G500G4 */
			PACKUSWB(R1,R2);				/* R1 = G7G6G5G4G3G2G1G0 */
			R2 = R1;						/* R2 = G7G6G5G4G3G2G1G0 */
			PUNPCKLBW(R2,R0);				/* R2 = G3B3G2B2G1B1G0B0 */
			PUNPCKHBW(R1,R0);				/* R1 = G7B7G6B6G5B5G4B4 */
			R0 = LOAD128(r);  r += 8;		/* R0 = 00R300R200R100R0 */
			R3 = LOAD128(r);  r += 8;		/* R3 = 00R700R600R500R4 */
			PACKUSWB(R0,R3);				/* R0 = R7R6R5R4R3R2R1R0 */
			R3 = XMM_ALL_ONES;				/* R3 = FFFFFFFFFFFFFFFF */
			R4 = R3;						/* R4 = FFFFFFFFFFFFFFFF */
			PUNPCKLBW(R4,R0);				/* R4 = FFR3FFR2FFR1FFR0 */
			PUNPCKHBW(R3,R0);				/* R3 = FFR7FFR6FFR5FFR4 */
			R0 = R4;						/* R0 = R4               */
			PUNPCKLWD(R0,R2);				/* R0 = FFR1G1B1FFR0G0B0 */
			PUNPCKHWD(R4,R2);				/* R4 = FFR3G3B3FFR2G2B2 */
			R2 = R3;						/* R2 = R3               */
			PUNPCKLWD(R2,R1);				/* R2 = FFR5G5B5FFR4G4B4 */
			PUNPCKHWD(R3,R1);				/* R3 = FFR7G7B7FFR6G6B6 */
			STORE128(out, R0);  out += 16;	/* FFR1G1B1FFR0G0B0      */
			STORE128(out, R4);  out += 16;	/* FFR3G3B3FFR2G2B2      */
			STORE128(out, R2);  out += 16;	/* FFR5G5B5FFR4G4B4      */
			STORE128(out, R3);  out += 16;	/* FFR7G7B7FFR6G6B6      */
		} while (width -= 16);
		/* Jump to next row. */
		r += srcbump;
		g += srcbump;
		b += srcbump;
		out += dstbump;
	}
	return PRIMITIVES_SUCCESS;
}
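
For reference, here is a minimal scalar sketch of the per-pixel transform the SSE2 loop above implements. It is an illustration only, not the actual general_RGBToRGB_16s8u_P3AC4R fallback: each signed 16-bit component is saturated to 0..255 (as PACKUSWB does), and the output pixel is written as the bytes B, G, R, 0xFF, which reads as 0xFFRRGGBB in a little-endian 32-bit word. The names clamp_u8 and rgb16s_to_bgrx_row are hypothetical; INT16, BYTE, and UINT32 are the same typedefs used by the primitive above.

static BYTE clamp_u8(INT16 v)	/* hypothetical helper, mirrors PACKUSWB saturation */
{
	if (v < 0)   return 0;
	if (v > 255) return 255;
	return (BYTE) v;
}

static void rgb16s_to_bgrx_row(const INT16 *r, const INT16 *g, const INT16 *b,
	BYTE *out, UINT32 width)	/* hypothetical illustration of one row */
{
	UINT32 x;
	for (x = 0; x < width; ++x)
	{
		*out++ = clamp_u8(b[x]);	/* byte 0: blue  */
		*out++ = clamp_u8(g[x]);	/* byte 1: green */
		*out++ = clamp_u8(r[x]);	/* byte 2: red   */
		*out++ = 0xFF;				/* byte 3: alpha, forced opaque like XMM_ALL_ONES */
	}
}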
Example #2
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level alone, we just always load from RAM.  Separate CLUTs are uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R (right) texels by frac_u/256, and the L (left) texels by (256 - frac_u)/256...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 0-255 scale, but now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}
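
In scalar terms, the code emitted above performs a standard bilinear blend of the four nearest samples (TL, TR, BL, BR) with 8-bit fractions. This assumes by256 holds 1.0f/256 per lane and ones holds 1.0f, which is what the MULPS/SUBPS sequence implies. Below is a rough C sketch of that math per channel; the names are illustrative only, and the rounding/clamping behavior of CVTPS2DQ + PACKSSDW + PACKUSWB is only approximated.

// Illustrative sketch of the emitted lerp; not part of SamplerJitCache.
// tl/tr/bl/br are the four nearest-sample results, expanded to per-channel
// floats in the 0..255 range, as done with PMOVZXBD/PUNPCK + CVTDQ2PS above.
static void bilinear_blend(const float tl[4], const float tr[4],
                           const float bl[4], const float br[4],
                           int frac_u, int frac_v, unsigned char out[4])
{
	const float fu = frac_u * (1.0f / 256.0f);	// frac_u scaled by by256
	const float fv = frac_v * (1.0f / 256.0f);	// frac_v scaled by by256
	for (int c = 0; c < 4; ++c) {
		float top    = tl[c] * (1.0f - fu) + tr[c] * fu;	// blend left/right, top row
		float bottom = bl[c] * (1.0f - fu) + br[c] * fu;	// blend left/right, bottom row
		float v      = top * (1.0f - fv) + bottom * fv;		// blend the two rows
		// Approximate the CVTPS2DQ + PACKSSDW + PACKUSWB clamp back to 0..255.
		if (v < 0.0f) v = 0.0f;
		if (v > 255.0f) v = 255.0f;
		out[c] = (unsigned char)v;
	}
}

The returned LinearFunc is presumably invoked with (uptr, vptr, frac_u, frac_v, src, bufw, level), matching the argument mapping described in the calling-convention comment near the top of the function.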