bool SamplerJitCache::Jit_GetTexDataSwizzled4() {
	// Get the horizontal tile pos into tempReg1.
	LEA(32, tempReg1, MScaled(uReg, SCALE_4, 0));
	// Note: imm8 sign extends negative.
	AND(32, R(tempReg1), Imm8(~127));

	// Add vertical offset inside tile to tempReg1.
	LEA(32, tempReg2, MScaled(vReg, SCALE_4, 0));
	AND(32, R(tempReg2), Imm8(31));
	LEA(32, tempReg1, MComplex(tempReg1, tempReg2, SCALE_4, 0));
	// Add srcReg, since we'll need it at some point.
	ADD(64, R(tempReg1), R(srcReg));

	// Now find the vertical tile pos, and add to tempReg1.
	SHR(32, R(vReg), Imm8(3));
	LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
	MUL(32, R(vReg));
	ADD(64, R(tempReg1), R(EAX));

	// Last and possibly also least, the horizontal offset inside the tile.
	AND(32, R(uReg), Imm8(31));
	SHR(32, R(uReg), Imm8(1));
	MOV(8, R(resultReg), MRegSum(tempReg1, uReg));
	FixupBranch skipNonZero = J_CC(CC_NC);
	// If the horizontal offset was odd, take the upper 4.
	SHR(8, R(resultReg), Imm8(4));
	SetJumpTarget(skipNonZero);
	// Zero out the rest of the bits.
	AND(32, R(resultReg), Imm8(0x0F));

	return true;
}
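// For reference, a plain C++ sketch (not part of the JIT; the name and layout
// assumptions are hypothetical) of the address math emitted above, assuming
// the PSP's swizzle layout of 16-byte x 8-row tiles (32 texels wide at 4bpp):
static inline u8 GetTexel4Swizzled(const u8 *src, u32 u, u32 v, u32 bufw) {
	const u8 *tile = src
		+ (u / 32) * 128        // horizontal tile: 128 bytes per tile
		+ (v / 8) * (bufw * 4)  // vertical tile strip: bufw/2 bytes x 8 rows
		+ (v & 7) * 16;         // row inside the tile: 16 bytes per row
	u8 b = tile[(u & 31) >> 1];
	return (u & 1) ? (b >> 4) : (b & 0x0F);  // odd u takes the upper nibble
}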
bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
	if (id.swizzle) {
		return Jit_GetTexDataSwizzled(id, bitsPerTexel);
	}

	// srcReg might be EDX, so let's copy that before we multiply.
	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		LEA(64, tempReg1, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
		break;

	case 4: {
		XOR(32, R(tempReg2), R(tempReg2));
		SHR(32, R(uReg), Imm8(1));
		FixupBranch skip = J_CC(CC_NC);
		// Track whether we shifted a 1 off or not.
		MOV(32, R(tempReg2), Imm32(4));
		SetJumpTarget(skip);
		LEA(64, tempReg1, MRegSum(srcReg, uReg));
		break;
	}

	default:
		return false;
	}

	MOV(32, R(EAX), R(vReg));
	MUL(32, R(bufwReg));

	switch (bitsPerTexel) {
	case 32:
	case 16:
	case 8:
		MOVZX(32, bitsPerTexel, resultReg, MComplex(tempReg1, RAX, bitsPerTexel / 8, 0));
		break;

	case 4: {
		SHR(32, R(RAX), Imm8(1));
		MOV(8, R(resultReg), MRegSum(tempReg1, RAX));
		// RCX is now free.
		MOV(8, R(RCX), R(tempReg2));
		SHR(8, R(resultReg), R(RCX));
		// Zero out any bits not shifted off.
		AND(32, R(resultReg), Imm8(0x0F));
		break;
	}

	default:
		return false;
	}

	return true;
}
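// A hypothetical reference for the linear (unswizzled) path above, assuming
// bufw (the stride in texels) is even, as PSP strides are 16-texel aligned:
static inline u32 GetTexelLinear(const u8 *src, u32 u, u32 v, u32 bufw, int bitsPerTexel) {
	switch (bitsPerTexel) {
	case 32: return ((const u32 *)src)[v * bufw + u];
	case 16: return ((const u16 *)src)[v * bufw + u];
	case 8:  return src[v * bufw + u];
	case 4: {
		u8 b = src[(v * bufw + u) >> 1];
		return (u & 1) ? (b >> 4) : (b & 0x0F);
	}
	default: return 0;
	}
}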
bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
	if (!id.useSharedClut) {
		// TODO: Need to load from RAM, always.
		if (id.linear) {
#ifdef _WIN32
			const int argOffset = 24 + 48 + 8 + 32;
			// Extra 8 to account for CALL.
			MOV(32, R(tempReg2), MDisp(RSP, argOffset + 16 + 8));
#else
			// Extra 8 to account for CALL.
			MOV(32, R(tempReg2), MDisp(RSP, 24 + 48 + 8 + 8));
#endif
			LEA(32, tempReg2, MScaled(tempReg2, SCALE_4, 0));
		} else {
#ifdef _WIN32
			// The argument was saved on the stack.
			MOV(32, R(tempReg2), MDisp(RSP, 40));
			LEA(32, tempReg2, MScaled(tempReg2, SCALE_4, 0));
#else
			// We need to multiply by 16 and add, LEA allows us to copy too.
			LEA(32, tempReg2, MScaled(levelReg, SCALE_4, 0));
#endif
		}

		// Second step of the multiply by 16 (since we only multiplied by 4 before).
		LEA(64, resultReg, MComplex(resultReg, tempReg2, SCALE_4, 0));
	}

	MOV(PTRBITS, R(tempReg1), ImmPtr(clut));

	switch ((GEPaletteFormat)id.clutfmt) {
	case GE_CMODE_16BIT_BGR5650:
		MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
		return Jit_Decode5650();

	case GE_CMODE_16BIT_ABGR5551:
		MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
		return Jit_Decode5551();

	case GE_CMODE_16BIT_ABGR4444:
		MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
		return Jit_Decode4444();

	case GE_CMODE_32BIT_ABGR8888:
		MOV(32, R(resultReg), MComplex(tempReg1, resultReg, SCALE_4, 0));
		return true;

	default:
		return false;
	}
}
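// A hypothetical reference for the lookup above (Decode5650 and friends stand
// in for the Jit_Decode* helpers), assuming a non-shared CLUT keeps a
// 16-entry block per mip level, which is what the x16 scaling implements:
static inline u32 ReadClutColor(const void *clut, u32 index, u32 level,
                                GEPaletteFormat fmt, bool sharedClut) {
	if (!sharedClut)
		index += level * 16;  // each level reads its own 16-entry block
	switch (fmt) {
	case GE_CMODE_16BIT_BGR5650:  return Decode5650(((const u16 *)clut)[index]);
	case GE_CMODE_16BIT_ABGR5551: return Decode5551(((const u16 *)clut)[index]);
	case GE_CMODE_16BIT_ABGR4444: return Decode4444(((const u16 *)clut)[index]);
	case GE_CMODE_32BIT_ABGR8888: return ((const u32 *)clut)[index];
	default: return 0;
	}
}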
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
	if (bitsPerTexel == 4) {
		// Specialized implementation.
		return Jit_GetTexDataSwizzled4();
	}

	LEA(32, tempReg1, MScaled(vReg, SCALE_4, 0));
	AND(32, R(tempReg1), Imm8(31));
	AND(32, R(vReg), Imm8(~7));

	MOV(32, R(tempReg2), R(uReg));
	MOV(32, R(resultReg), R(uReg));
	switch (bitsPerTexel) {
	case 32:
		SHR(32, R(resultReg), Imm8(2));
		break;
	case 16:
		SHR(32, R(vReg), Imm8(1));
		SHR(32, R(tempReg2), Imm8(1));
		SHR(32, R(resultReg), Imm8(3));
		break;
	case 8:
		SHR(32, R(vReg), Imm8(2));
		SHR(32, R(tempReg2), Imm8(2));
		SHR(32, R(resultReg), Imm8(4));
		break;
	default:
		return false;
	}
	AND(32, R(tempReg2), Imm8(3));
	SHL(32, R(resultReg), Imm8(5));
	ADD(32, R(tempReg1), R(tempReg2));
	ADD(32, R(tempReg1), R(resultReg));

	// We may clobber srcReg in the MUL, so let's grab it now.
	LEA(64, tempReg1, MComplex(srcReg, tempReg1, SCALE_4, 0));

	LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
	MUL(32, R(vReg));

	switch (bitsPerTexel) {
	case 32:
		MOV(bitsPerTexel, R(resultReg), MRegSum(tempReg1, EAX));
		break;
	case 16:
		AND(32, R(uReg), Imm8(1));
		// Multiply by two by just adding twice.
		ADD(32, R(EAX), R(uReg));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	case 8:
		AND(32, R(uReg), Imm8(3));
		ADD(32, R(EAX), R(uReg));
		MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
		break;
	default:
		return false;
	}

	return true;
}
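// A hypothetical reference for the 8/16/32bpp swizzled address computed above:
// the same 16-byte x 8-row tiling as the 4bpp case, expressed on byte offsets.
static inline const u8 *SwizzledAddr(const u8 *src, u32 u, u32 v, u32 bufw,
                                     int bitsPerTexel) {
	u32 xByte = u * (bitsPerTexel / 8);
	u32 byteStride = bufw * (bitsPerTexel / 8);
	return src
		+ (v / 8) * (byteStride * 8)  // vertical strip of tiles
		+ (xByte / 16) * 128          // horizontal tile: 128 bytes each
		+ (v & 7) * 16                // row inside the tile
		+ (xByte & 15);               // byte inside the row
}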
void Jit64::stfd(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU ||
	    Core::g_CoreStartupParameter.bTLBHack)
	{
		mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging)
	{
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3)
	{
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	}
	else
	{
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);
	SetJumpTarget(safe);

	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}
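// A hypothetical reference for the safe path above: the PPC side is
// big-endian, so the double is stored as two byteswapped 32-bit writes with
// the high word first (Write32 stands in for the memory helper that
// SafeWriteRegToReg eventually reaches):
static inline void StoreDoubleSafe(u32 ea, u64 value)
{
	Write32((u32)(value >> 32), ea + 0);  // high half (the PSRLQ XMM0, 32 above)
	Write32((u32)(value >> 0),  ea + 4);  // low half
}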