/**
 * SSE2 conversion of three planar 16-bit channel arrays (R, G, B) into one
 * interleaved 4-bytes-per-pixel destination.  The fourth byte of each pixel
 * comes from XMM_ALL_ONES.
 *
 * The vector path is only taken when all three source planes and the
 * destination are 16-byte aligned, both steps are multiples of 16 bytes and
 * the width is a multiple of 16 pixels (one inner iteration consumes 16
 * pixels).  Otherwise the call is forwarded unchanged to
 * general_RGBToRGB_16s8u_P3AC4R().
 *
 * An empty region (zero width or zero height) converts nothing and reports
 * success.
 *
 * Returns PRIMITIVES_SUCCESS from the vector path, or whatever the general
 * implementation returns when falling back.
 */
PRIM_STATIC pstatus_t sse2_RGBToRGB_16s8u_P3AC4R(
	const INT16 *pSrc[3],	/* 16-bit R,G, and B arrays */
	INT32 srcStep,		/* bytes between rows in source data */
	BYTE *pDst,		/* 32-bit interleaved ARGB (ABGR?) data */
	INT32 dstStep,		/* bytes between rows in dest data */
	const prim_size_t *roi)	/* region of interest */
{
	const UINT16 *r = (const UINT16 *) (pSrc[0]);
	const UINT16 *g = (const UINT16 *) (pSrc[1]);
	const UINT16 *b = (const UINT16 *) (pSrc[2]);
	BYTE *out;
	int srcbump, dstbump, y;

	/* Ensure 16-byte alignment on all pointers,
	 * that width is a multiple of 16,
	 * and that the next row will also remain aligned.
	 * Since this is usually used for 64x64 aligned arrays,
	 * these checks should presumably pass.
	 */
	if ((((ULONG_PTR) (pSrc[0]) & 0x0f) != 0)
			|| (((ULONG_PTR) (pSrc[1]) & 0x0f) != 0)
			|| (((ULONG_PTR) (pSrc[2]) & 0x0f) != 0)
			|| (((ULONG_PTR) pDst & 0x0f) != 0)
			|| (roi->width & 0x0f)
			|| (srcStep & 0x0f)
			|| (dstStep & 0x0f))
	{
		return general_RGBToRGB_16s8u_P3AC4R(pSrc, srcStep, pDst, dstStep, roi);
	}

	out = (BYTE *) pDst;
	/* Row padding: in UINT16 elements for the sources, in bytes for the destination. */
	srcbump = (srcStep - (roi->width * sizeof(UINT16))) / sizeof(UINT16);
	dstbump = (dstStep - (roi->width * sizeof(UINT32)));

	for (y=0; y<roi->height; ++y)
	{
		int width;

		/* Pre-tested loop: a zero width satisfies the multiple-of-16 check
		 * above, and a do/while would run the body once on an empty row and
		 * then never see its counter reach zero again.
		 */
		for (width = roi->width; width > 0; width -= 16)
		{
			__m128i R0, R1, R2, R3, R4;
			/* The comments below pretend these are 8-byte registers
			 * rather than 16-byte, for readability.
			 */
			R0 = LOAD128(b);  b += 8;	/* R0 = 00B300B200B100B0 */
			R1 = LOAD128(b);  b += 8;	/* R1 = 00B700B600B500B4 */
			PACKUSWB(R0,R1);		/* R0 = B7B6B5B4B3B2B1B0 */
			R1 = LOAD128(g);  g += 8;	/* R1 = 00G300G200G100G0 */
			R2 = LOAD128(g);  g += 8;	/* R2 = 00G700G600G500G4 */
			PACKUSWB(R1,R2);		/* R1 = G7G6G5G4G3G2G1G0 */
			R2 = R1;			/* R2 = G7G6G5G4G3G2G1G0 */
			PUNPCKLBW(R2,R0);		/* R2 = G3B3G2B2G1B1G0B0 */
			PUNPCKHBW(R1,R0);		/* R1 = G7B7G6B6G5B5G4B4 */
			R0 = LOAD128(r);  r += 8;	/* R0 = 00R300R200R100R0 */
			R3 = LOAD128(r);  r += 8;	/* R3 = 00R700R600R500R4 */
			PACKUSWB(R0,R3);		/* R0 = R7R6R5R4R3R2R1R0 */
			R3 = XMM_ALL_ONES;		/* R3 = FFFFFFFFFFFFFFFF */
			R4 = R3;			/* R4 = FFFFFFFFFFFFFFFF */
			PUNPCKLBW(R4,R0);		/* R4 = FFR3FFR2FFR1FFR0 */
			PUNPCKHBW(R3,R0);		/* R3 = FFR7FFR6FFR5FFR4 */
			R0 = R4;			/* R0 = R4               */
			PUNPCKLWD(R0,R2);		/* R0 = FFR1G1B1FFR0G0B0 */
			PUNPCKHWD(R4,R2);		/* R4 = FFR3G3B3FFR2G2B2 */
			R2 = R3;			/* R2 = R3               */
			PUNPCKLWD(R2,R1);		/* R2 = FFR5G5B5FFR4G4B4 */
			PUNPCKHWD(R3,R1);		/* R3 = FFR7G7B7FFR6G6B6 */
			STORE128(out, R0);  out += 16;	/* FFR1G1B1FFR0G0B0 */
			STORE128(out, R4);  out += 16;	/* FFR3G3B3FFR2G2B2 */
			STORE128(out, R2);  out += 16;	/* FFR5G5B5FFR4G4B4 */
			STORE128(out, R3);  out += 16;	/* FFR7G7B7FFR6G6B6 */
		}

		/* Jump to next row. */
		r += srcbump;
		g += srcbump;
		b += srcbump;
		out += dstbump;
	}

	return PRIMITIVES_SUCCESS;
}
// Emits the linear (4-tap, blended) sampler for the given sampler id.
//
// Two routines are written back to back: first the nearest sampler, which is
// only ever reached through CALL, then the externally visible linear function
// that calls it four times and blends the four results.
//
// Returns the entry point of the linear function, or nullptr when
// Jit_ReadTextureFormat() fails for this id (the code pointer is then moved
// back to where emission started).
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// The nearest sampler goes first; the linear function CALLs it.
	// What is emitted here may differ slightly based on the "linear" flag.
	const u8 *nearestEntry = AlignCode16();
	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearestEntry));
		return nullptr;
	}
	RET();

	// The linear function proper - this is the address handed back to the caller.
	const u8 *linearEntry = AlignCode16();

	// NOTE: The general register mapping is not used here.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// For the nearest CALLs these are remapped, in order: u, v, src, bufw, level

	// Prologue: save the registers taken over below (popped in reverse order at the end).
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// frac_u/frac_v are not needed until the blend, so park them on the stack.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra room to restore alignment and to hold each resultReg for the lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifndef _WIN32
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#else
	// Stack args start at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int stackArgBase = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, stackArgBase));
	MOV(32, R(R15), MDisp(RSP, stackArgBase + 8));
	// level is at stackArgBase + 16.
#endif

	// Bail out early with a zero result when src is null.
	FixupBranch skipBlend;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch srcIsValid = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		skipBlend = J(true);
		SetJumpTarget(srcIsValid);
	}

	// State from here on:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	// One nearest lookup per entry of the u/v arrays; each result lands at the
	// same byte offset in the scratch area at the bottom of the stack.
	for (int byteOffset = 0; byteOffset < 16; byteOffset += 4) {
		MOV(32, R(uReg), MDisp(R12, byteOffset));
		MOV(32, R(vReg), MDisp(R13, byteOffset));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// level is left alone, it is always loaded from RAM.  Separate CLUTs is uncommon.

		CALL(nearestEntry);
		MOV(32, MDisp(RSP, byteOffset), R(resultReg));
	}

	// Widen TL, TR, BL, BR to floats so the blend is plain multiply/add.
	const decltype(fpScratchReg1) texelRegs[4] = {
		fpScratchReg1, fpScratchReg2, fpScratchReg3, fpScratchReg4,
	};
	const bool hasSSE41 = cpu_info.bSSE4_1;

	if (!hasSSE41) {
		// The unpack fallback below needs a zeroed register.
		PXOR(XMM0, R(XMM0));
	}

	for (int i = 0; i < 4; ++i) {
		MOVD_xmm(texelRegs[i], MDisp(RSP, i * 4));
	}

	if (hasSSE41) {
		for (int i = 0; i < 4; ++i) {
			PMOVZXBD(texelRegs[i], R(texelRegs[i]));
		}
	} else {
		// Bytes -> words, then words -> dwords, interleaving with the zeroed XMM0.
		for (int i = 0; i < 4; ++i) {
			PUNPCKLBW(texelRegs[i], R(XMM0));
		}
		for (int i = 0; i < 4; ++i) {
			PUNPCKLWD(texelRegs[i], R(XMM0));
		}
	}

	for (int i = 0; i < 4; ++i) {
		CVTDQ2PS(texelRegs[i], R(texelRegs[i]));
	}

	// Loads a saved fraction from the stack, converts it to float and copies
	// lane 0 into all four lanes of fpScratchReg5.
	auto emitLoadFraction = [&](int stackOffset) {
		MOVD_xmm(fpScratchReg5, MDisp(RSP, stackOffset));
		CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
		SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	};
	// XMM0 = ones - fpScratchReg5, i.e. the weight for the opposite side.
	auto emitComplement = [&]() {
		MOVAPS(XMM0, M(ones));
		SUBPS(XMM0, R(fpScratchReg5));
	};

	// Horizontal pass: R sides are weighted by frac_u, L sides by (256 - frac_u)...
	emitLoadFraction(24);
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	emitComplement();

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// After these adds: top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Vertical pass with frac_v.
	emitLoadFraction(32);
	MULPS(fpScratchReg5, M(by256));
	emitComplement();

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Fully interpolated now, still at the 255 scale.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Pack the four float channels back down into a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	// The null-src early exit lands here, straight on the epilogue.
	if (id.hasInvalidPtr) {
		SetJumpTarget(skipBlend);
	}

	// Epilogue: release the scratch area and undo the pushes in reverse order.
	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);
	RET();

	EndWrite();
	return (LinearFunc)linearEntry;
}