OpArg FPURegCache::GetDefaultLocation(int reg) const {
	if (reg < 32) {
		return MDisp(CTXREG, reg * 4);
	} else if (reg < 32 + 128) {
		return M(&mips->v[voffset[reg - 32]]);
	} else {
		return M(&tempValues[reg - 32 - 128]);
	}
}
void Jit::JitSafeMem::DoSlowWrite(void *safeFunc, const OpArg src, int suboffset) {
	if (iaddr_ != (u32) -1)
		jit_->MOV(32, R(EAX), Imm32(iaddr_ + suboffset));
	else
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));

	jit_->ABI_CallFunctionAA(jit_->thunks.ProtectFunction(safeFunc, 2), src, R(EAX));
	needsCheck_ = true;
}
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset) {
	if (jit_->gpr.IsImmediate(raddr_)) {
		u32 addr = jit_->gpr.GetImmediate32(raddr_) + offset_ + suboffset;
#ifdef _M_IX86
		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
		return MDisp(RBX, addr);
#endif
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_ + suboffset);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
OpArg Jit::JitSafeMem::PrepareMemoryOpArg(ReadType type) {
	// We may not even need to move into EAX as a temporary.
	bool needTemp = alignMask_ != 0xFFFFFFFF;
#ifdef _M_IX86
	// We always mask on 32 bit in fast memory mode.
	needTemp = needTemp || fast_;
#endif

	if (jit_->gpr.R(raddr_).IsSimpleReg() && !needTemp) {
		jit_->gpr.MapReg(raddr_, true, false);
		xaddr_ = jit_->gpr.RX(raddr_);
	} else {
		jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
		xaddr_ = EAX;
	}

	MemCheckAsm(type);

	if (!fast_) {
		// Is it in physical ram?
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
		tooLow_ = jit_->J_CC(CC_B);
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_ - (size_ - 1)));
		tooHigh_ = jit_->J_CC(CC_AE);

		// We may need to jump back up here.
		safe_ = jit_->GetCodePtr();
	} else {
#ifdef _M_IX86
		jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
#endif
	}

	// TODO: This could be more optimal, but the common case is that we want xaddr_ not to include offset_.
	// Since we need to align them after add, we add and subtract.
	if (alignMask_ != 0xFFFFFFFF) {
		jit_->ADD(32, R(xaddr_), Imm32(offset_));
		jit_->AND(32, R(xaddr_), Imm32(alignMask_));
		jit_->SUB(32, R(xaddr_), Imm32(offset_));
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
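// Illustrative scalar sketch of the add/mask/subtract alignment trick above; this
// helper is hypothetical and not part of the emitter. The generated code leaves
// xaddr_ equal to ((addr + offset_) & alignMask_) - offset_, so the returned
// operand, which re-adds offset_ in the addressing mode, ends up dereferencing
// base + ((addr + offset_) & alignMask_), i.e. the aligned guest address.
// Uses the codebase's u32 typedef.
static inline u32 AlignedGuestAddress(u32 addr, u32 offset, u32 alignMask) {
	u32 xaddr = ((addr + offset) & alignMask) - offset;  // what the JIT leaves in xaddr_
	return xaddr + offset;                               // what MDisp/MComplex adds back
}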
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset) {
	if (jit_->gpr.IsImm(raddr_)) {
		u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;
#ifdef _M_IX86
		return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
		return MDisp(RBX, addr);
#endif
	}

	_dbg_assert_msg_(JIT, (suboffset & alignMask_) == suboffset, "suboffset must be aligned");

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_ + suboffset);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
void Jit::JitSafeMem::DoSlowWrite(void *safeFunc, const OpArg src, int suboffset) {
	if (iaddr_ != (u32) -1)
		jit_->MOV(32, R(EAX), Imm32((iaddr_ + suboffset) & alignMask_));
	else {
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));
		if (alignMask_ != 0xFFFFFFFF)
			jit_->AND(32, R(EAX), Imm32(alignMask_));
	}

	jit_->CallProtectedFunction(safeFunc, src, R(EAX));
	needsCheck_ = true;
}
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst) {
	FALLBACK_IF(true);

	// TODO!
#if 0
	if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
	{
		Default(inst);
		return;
	} // turn off from debugger
	INSTRUCTION_START;
	MOV(32, R(EAX), gpr.R(inst.RB));
	if (inst.RA)
		ADD(32, R(EAX), gpr.R(inst.RA));
	AND(32, R(EAX), Imm32(~31));
	PXOR(XMM0, R(XMM0));
#if _M_X86_64
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
	MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
	AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
	MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
	MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
#endif
}
OpArg Jit::JitSafeMem::PrepareMemoryOpArg() {
	// We may not even need to move into EAX as a temporary.
	// TODO: Except on x86 in fastmem mode.
	if (jit_->gpr.R(raddr_).IsSimpleReg()) {
		jit_->gpr.BindToRegister(raddr_, true, false);
		xaddr_ = jit_->gpr.RX(raddr_);
	} else {
		jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
		xaddr_ = EAX;
	}

	if (!g_Config.bFastMemory) {
		// Is it in physical ram?
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
		tooLow_ = jit_->J_CC(CC_L);
		jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_));
		tooHigh_ = jit_->J_CC(CC_GE);

		// We may need to jump back up here.
		safe_ = jit_->GetCodePtr();
	} else {
#ifdef _M_IX86
		// Need to modify it, too bad.
		if (xaddr_ != EAX)
			jit_->MOV(32, R(EAX), R(xaddr_));
		jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
		xaddr_ = EAX;
#endif
	}

#ifdef _M_IX86
	return MDisp(xaddr_, (u32) Memory::base + offset_);
#else
	return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
void Jit::JitSafeMem::NextSlowRead(void *safeFunc, int suboffset) {
	_dbg_assert_msg_(JIT, !g_Config.bFastMemory, "NextSlowRead() called in fast memory mode?");

	// For simplicity, do nothing for 0.  We already read in PrepareSlowRead().
	if (suboffset == 0)
		return;

	if (jit_->gpr.IsImmediate(raddr_)) {
		_dbg_assert_msg_(JIT, !Memory::IsValidAddress(iaddr_), "NextSlowRead() for a valid immediate address?");

		jit_->MOV(32, R(EAX), Imm32(iaddr_ + suboffset));
	}
	// For GPR, if xaddr_ was the dest register, this will be wrong.  Don't use in GPR.
	else
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));

	jit_->ABI_CallFunctionA(jit_->thunks.ProtectFunction(safeFunc, 1), R(EAX));
}
bool Jit::JitSafeMem::PrepareRead(OpArg &src) {
	if (iaddr_ != (u32) -1) {
		if (Memory::IsValidAddress(iaddr_)) {
#ifdef _M_IX86
			src = M(Memory::base + (iaddr_ & Memory::MEMVIEW32_MASK));
#else
			src = MDisp(RBX, iaddr_);
#endif
			return true;
		} else
			return false;
	} else
		src = PrepareMemoryOpArg();

	return true;
}
bool Jit::JitSafeMem::PrepareWrite(OpArg &dest) {
	// If it's an immediate, we can do the write if valid.
	if (iaddr_ != (u32) -1) {
		if (Memory::IsValidAddress(iaddr_)) {
#ifdef _M_IX86
			dest = M(Memory::base + (iaddr_ & Memory::MEMVIEW32_MASK));
#else
			dest = MDisp(RBX, iaddr_);
#endif
			return true;
		} else
			return false;
	}
	// Otherwise, we always can do the write (conditionally.)
	else
		dest = PrepareMemoryOpArg();

	return true;
}
bool Jit::JitSafeMem::PrepareRead(OpArg &src, int size) {
	size_ = size;
	if (iaddr_ != (u32) -1) {
		if (ImmValid()) {
			MemCheckImm(MEM_READ);

#ifdef _M_IX86
			src = M(Memory::base + (iaddr_ & Memory::MEMVIEW32_MASK & alignMask_));
#else
			src = MDisp(RBX, iaddr_ & alignMask_);
#endif
			return true;
		} else
			return false;
	} else
		src = PrepareMemoryOpArg(MEM_READ);

	return true;
}
bool Jit::JitSafeMem::PrepareSlowRead(void *safeFunc) {
	if (!g_Config.bFastMemory) {
		if (iaddr_ != (u32) -1) {
			// No slow read necessary.
			if (Memory::IsValidAddress(iaddr_))
				return false;
			jit_->MOV(32, R(EAX), Imm32(iaddr_));
		} else {
			PrepareSlowAccess();
			jit_->LEA(32, EAX, MDisp(xaddr_, offset_));
		}

		jit_->ABI_CallFunctionA(jit_->thunks.ProtectFunction(safeFunc, 1), R(EAX));
		needsCheck_ = true;
		return true;
	} else
		return false;
}
void Jit::JitSafeMem::NextSlowRead(void *safeFunc, int suboffset) {
	_dbg_assert_msg_(JIT, !fast_, "NextSlowRead() called in fast memory mode?");

	// For simplicity, do nothing for 0.  We already read in PrepareSlowRead().
	if (suboffset == 0)
		return;

	if (jit_->gpr.IsImm(raddr_)) {
		// The assert fires when the address turns out to be valid, so the message names that case.
		_dbg_assert_msg_(JIT, !Memory::IsValidAddress(iaddr_ + suboffset), "NextSlowRead() for a valid immediate address?");

		jit_->MOV(32, R(EAX), Imm32((iaddr_ + suboffset) & alignMask_));
	}
	// For GPR, if xaddr_ was the dest register, this will be wrong.  Don't use in GPR.
	else {
		jit_->LEA(32, EAX, MDisp(xaddr_, offset_ + suboffset));
		if (alignMask_ != 0xFFFFFFFF)
			jit_->AND(32, R(EAX), Imm32(alignMask_));
	}

	jit_->CallProtectedFunction(safeFunc, R(EAX));
}
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();
	if (!Jit_ReadTextureFormat(id)) {
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}
	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM.  Separate CLUTs is uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		SetJumpTarget(zeroSrc);
	}

	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);
	RET();

	EndWrite();
	return (LinearFunc)start;
}
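// Scalar reference for the SSE blend in CompileLinear above (illustrative only;
// this helper is not part of the sampler JIT). It assumes `by256` holds 1/256 in
// each lane and `ones` holds 1.0, so each frac value in 0..255 becomes a weight
// in [0, 1). tl/tr/bl/br are the four nearest-sample channels widened to float.
static inline float BilinearBlendChannel(float tl, float tr, float bl, float br,
                                         int frac_u, int frac_v) {
	float u = frac_u * (1.0f / 256.0f);
	float v = frac_v * (1.0f / 256.0f);
	float top = tl * (1.0f - u) + tr * u;     // blend TL/TR by frac_u
	float bottom = bl * (1.0f - u) + br * u;  // blend BL/BR by frac_u
	return top * (1.0f - v) + bottom * v;     // then blend rows by frac_v
}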
void Jit64::stfd(UGeckoInstruction inst) {
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int s = inst.RS;
	int a = inst.RA;

	u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
	if (Core::g_CoreStartupParameter.bMMU ||
	    Core::g_CoreStartupParameter.bTLBHack) {
		mem_mask |= Memory::ADDR_MASK_MEM1;
	}
#ifdef ENABLE_MEM_CHECK
	if (Core::g_CoreStartupParameter.bEnableDebugging) {
		mem_mask |= Memory::EXRAM_MASK;
	}
#endif

	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	fpr.Lock(s);
	gpr.BindToRegister(a, true, false);

	s32 offset = (s32)(s16)inst.SIMM_16;
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
	FixupBranch safe = J_CC(CC_NZ);

	// Fast routine
	if (cpu_info.bSSSE3) {
		MOVAPD(XMM0, fpr.R(s));
		PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
		MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
		AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
	} else {
		MOVAPD(XMM0, fpr.R(s));
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

		PSRLQ(XMM0, 32);
		MOVD_xmm(R(EAX), XMM0);
		UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
	}
	FixupBranch exit = J(true);
	SetJumpTarget(safe);

	// Safe but slow routine
	MOVAPD(XMM0, fpr.R(s));
	PSRLQ(XMM0, 32);
	MOVD_xmm(R(EAX), XMM0);
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

	MOVAPD(XMM0, fpr.R(s));
	MOVD_xmm(R(EAX), XMM0);
	LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
	SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

	SetJumpTarget(exit);

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}
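// Illustrative check (hypothetical helper, not a Dolphin API): the TEST against
// mem_mask in stfd above is the address classification. If any masked bit is set,
// the effective address may hit MMIO or otherwise non-fastmem memory, so the code
// falls through to the SafeWriteRegToReg path; otherwise the fast path writes
// straight through the memory view.
static inline bool NeedsSafeStorePath(u32 effectiveAddr, u32 mem_mask) {
	return (effectiveAddr & mem_mask) != 0;  // mirrors TEST + J_CC(CC_NZ)
}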
void Jit64::lfd(UGeckoInstruction inst) {
	INSTRUCTION_START
	JITDISABLE(bJITLoadStoreFloatingOff);
	FALLBACK_IF(js.memcheck || !inst.RA);

	int d = inst.RD;
	int a = inst.RA;
	s32 offset = (s32)(s16)inst.SIMM_16;
	gpr.FlushLockX(ABI_PARAM1);
	gpr.Lock(a);
	MOV(32, R(ABI_PARAM1), gpr.R(a));
	// TODO - optimize. This has to load the previous value - upper double should stay unmodified.
	fpr.Lock(d);
	fpr.BindToRegister(d, true);
	X64Reg xd = fpr.RX(d);

	if (cpu_info.bSSSE3) {
#if _M_X86_64
		MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
		PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
		MOVSD(xd, R(XMM0));
	} else {
#if _M_X86_64
		LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
		MOV(64, M(&temp64), R(EAX));

		MEMCHECK_START

		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#else
		AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
		BSWAP(32, EAX);
		MOV(32, M((void*)((u8 *)&temp64 + 4)), R(EAX));

		MEMCHECK_START

		MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
		BSWAP(32, EAX);
		MOV(32, M(&temp64), R(EAX));
		MOVSD(XMM0, M(&temp64));
		MOVSD(xd, R(XMM0));

		MEMCHECK_END
#endif
	}

	gpr.UnlockAll();
	gpr.UnlockAllX();
	fpr.UnlockAll();
}