// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
  FALLBACK_IF(true);

  // TODO!
#if 0
  if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff)
  {
    Default(inst);
    return;
  }
  INSTRUCTION_START;
  MOV(32, R(EAX), gpr.R(inst.RB));
  if (inst.RA)
    ADD(32, R(EAX), gpr.R(inst.RA));
  AND(32, R(EAX), Imm32(~31));
  PXOR(XMM0, R(XMM0));
#if _M_X86_64
  MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0);
  MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0);
#else
  AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
  MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0);
  MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0);
#endif
#endif
}
bool SamplerJitCache::Jit_GetTexData(const SamplerID &id, int bitsPerTexel) {
  if (id.swizzle) {
    return Jit_GetTexDataSwizzled(id, bitsPerTexel);
  }

  // srcReg might be EDX, so let's copy that before we multiply.
  switch (bitsPerTexel) {
  case 32:
  case 16:
  case 8:
    LEA(64, tempReg1, MComplex(srcReg, uReg, bitsPerTexel / 8, 0));
    break;

  case 4: {
    XOR(32, R(tempReg2), R(tempReg2));
    SHR(32, R(uReg), Imm8(1));
    FixupBranch skip = J_CC(CC_NC);
    // Track whether we shifted a 1 off or not.
    MOV(32, R(tempReg2), Imm32(4));
    SetJumpTarget(skip);
    LEA(64, tempReg1, MRegSum(srcReg, uReg));
    break;
  }

  default:
    return false;
  }

  MOV(32, R(EAX), R(vReg));
  MUL(32, R(bufwReg));

  switch (bitsPerTexel) {
  case 32:
  case 16:
  case 8:
    MOVZX(32, bitsPerTexel, resultReg, MComplex(tempReg1, RAX, bitsPerTexel / 8, 0));
    break;

  case 4: {
    SHR(32, R(RAX), Imm8(1));
    MOV(8, R(resultReg), MRegSum(tempReg1, RAX));
    // RCX is now free.
    MOV(8, R(RCX), R(tempReg2));
    SHR(8, R(resultReg), R(RCX));
    // Zero out any bits not shifted off.
    AND(32, R(resultReg), Imm8(0x0F));
    break;
  }

  default:
    return false;
  }

  return true;
}
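// For reference, a minimal plain-C++ sketch of what the unswizzled 4bpp path
// above computes; this helper is illustrative only (not part of the sampler
// cache) and assumes u, v, and bufw are in pixels.
static uint8_t GetTexel4(const uint8_t *src, uint32_t u, uint32_t v, uint32_t bufw) {
  // Matches LEA(src + u/2) followed by MUL and SHR: byte = src[u/2 + (v*bufw)/2].
  const uint8_t byte = src[(u >> 1) + ((v * bufw) >> 1)];
  // tempReg2 held 4 exactly when the SHR carried, i.e. when u was odd.
  return (u & 1) ? (byte >> 4) : (byte & 0x0F);
}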
void Jit::Comp_FPULS(u32 op)
{
  CONDITIONAL_DISABLE;

  if (!g_Config.bFastMemory) {
    DISABLE;
  }

  s32 offset = (s16)(op & 0xFFFF);
  int ft = (op >> 16) & 0x1f;
  int rs = _RS;
  // u32 addr = R(rs) + offset;

  switch (op >> 26)
  {
  case 49: //FI(ft) = Memory::Read_U32(addr); break; //lwc1
    gpr.Lock(rs);
    fpr.Lock(ft);
    fpr.BindToRegister(ft, false, true);
#ifdef _M_IX86
    MOV(32, R(EAX), gpr.R(rs));
    AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
    MOVSS(fpr.RX(ft), MDisp(EAX, (u32)Memory::base + offset));
#else
    MOV(32, R(EAX), gpr.R(rs));
    MOVSS(fpr.RX(ft), MComplex(RBX, RAX, SCALE_1, offset));
#endif
    gpr.UnlockAll();
    fpr.UnlockAll();
    break;

  case 57: //Memory::Write_U32(FI(ft), addr); break; //swc1
    gpr.Lock(rs);
    fpr.Lock(ft);
    fpr.BindToRegister(ft, true, false);
#ifdef _M_IX86
    MOV(32, R(EAX), gpr.R(rs));
    AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
    MOVSS(MDisp(EAX, (u32)Memory::base + offset), fpr.RX(ft));
#else
    MOV(32, R(EAX), gpr.R(rs));
    MOVSS(MComplex(RBX, RAX, SCALE_1, offset), fpr.RX(ft));
#endif
    gpr.UnlockAll();
    fpr.UnlockAll();
    break;

  default:
    _dbg_assert_msg_(CPU, 0, "Trying to interpret FPULS instruction that can't be interpreted");
    break;
  }
}
bool SamplerJitCache::Jit_ReadClutColor(const SamplerID &id) {
  if (!id.useSharedClut) {
    // TODO: Need to load from RAM, always.
    if (id.linear) {
#ifdef _WIN32
      const int argOffset = 24 + 48 + 8 + 32;
      // Extra 8 to account for CALL.
      MOV(32, R(tempReg2), MDisp(RSP, argOffset + 16 + 8));
#else
      // Extra 8 to account for CALL.
      MOV(32, R(tempReg2), MDisp(RSP, 24 + 48 + 8 + 8));
#endif
      LEA(32, tempReg2, MScaled(tempReg2, SCALE_4, 0));
    } else {
#ifdef _WIN32
      // The argument was saved on the stack.
      MOV(32, R(tempReg2), MDisp(RSP, 40));
      LEA(32, tempReg2, MScaled(tempReg2, SCALE_4, 0));
#else
      // We need to multiply by 16 and add, LEA allows us to copy too.
      LEA(32, tempReg2, MScaled(levelReg, SCALE_4, 0));
#endif
    }

    // Second step of the multiply by 16 (since we only multiplied by 4 before.)
    LEA(64, resultReg, MComplex(resultReg, tempReg2, SCALE_4, 0));
  }

  MOV(PTRBITS, R(tempReg1), ImmPtr(clut));

  switch ((GEPaletteFormat)id.clutfmt) {
  case GE_CMODE_16BIT_BGR5650:
    MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
    return Jit_Decode5650();

  case GE_CMODE_16BIT_ABGR5551:
    MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
    return Jit_Decode5551();

  case GE_CMODE_16BIT_ABGR4444:
    MOVZX(32, 16, resultReg, MComplex(tempReg1, resultReg, SCALE_2, 0));
    return Jit_Decode4444();

  case GE_CMODE_32BIT_ABGR8888:
    MOV(32, R(resultReg), MComplex(tempReg1, resultReg, SCALE_4, 0));
    return true;

  default:
    return false;
  }
}
std::shared_ptr<FunctionTree>
CoherentIntensity::createFunctionTree(const ParameterList &DataSample,
                                      const std::string &suffix) const {

  size_t n = DataSample.mDoubleValue(0)->values().size();

  auto NodeName = "CoherentIntensity(" + Name + ")" + suffix;

  auto tr = std::make_shared<FunctionTree>(
      NodeName, MDouble("", n), std::make_shared<AbsSquare>(ParType::MDOUBLE));
  tr->createNode("SumOfAmplitudes", MComplex("", n),
                 std::make_shared<AddAll>(ParType::MCOMPLEX), NodeName);

  for (auto i : Amplitudes) {
    std::shared_ptr<ComPWA::FunctionTree> resTree =
        i->createFunctionTree(DataSample, suffix);
    if (!resTree->sanityCheck())
      throw std::runtime_error("CoherentIntensity::createFunctionTree(): tree "
                               "didn't pass sanity check!");
    resTree->parameter();
    tr->insertTree(resTree, "SumOfAmplitudes");
  }

  return tr;
}
bool SamplerJitCache::Jit_GetTexDataSwizzled4() {
  // Get the horizontal tile pos into tempReg1.
  LEA(32, tempReg1, MScaled(uReg, SCALE_4, 0));
  // Note: imm8 sign extends negative.
  AND(32, R(tempReg1), Imm8(~127));

  // Add vertical offset inside tile to tempReg1.
  LEA(32, tempReg2, MScaled(vReg, SCALE_4, 0));
  AND(32, R(tempReg2), Imm8(31));
  LEA(32, tempReg1, MComplex(tempReg1, tempReg2, SCALE_4, 0));
  // Add srcReg, since we'll need it at some point.
  ADD(64, R(tempReg1), R(srcReg));

  // Now find the vertical tile pos, and add to tempReg1.
  SHR(32, R(vReg), Imm8(3));
  LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
  MUL(32, R(vReg));
  ADD(64, R(tempReg1), R(EAX));

  // Last, and possibly also least, the horizontal offset inside the tile.
  AND(32, R(uReg), Imm8(31));
  SHR(32, R(uReg), Imm8(1));
  MOV(8, R(resultReg), MRegSum(tempReg1, uReg));
  FixupBranch skipNonZero = J_CC(CC_NC);
  // If the horizontal offset was odd, take the upper 4.
  SHR(8, R(resultReg), Imm8(4));
  SetJumpTarget(skipNonZero);
  // Zero out the rest of the bits.
  AND(32, R(resultReg), Imm8(0x0F));

  return true;
}
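// A hedged plain-C++ sketch of the address math the JIT above performs for
// 4bpp swizzled textures, where tiles are 16 bytes wide by 8 rows tall
// (128 bytes each). Illustrative helper, not part of the sampler cache.
static uint8_t GetTexel4Swizzled(const uint8_t *src, uint32_t u, uint32_t v, uint32_t bufw) {
  const uint32_t tileX = (u * 4) & ~127u;          // horizontal tile, 128 bytes apart
  const uint32_t rowInTile = ((v * 4) & 31) * 4;   // 16 bytes per row inside the tile
  const uint32_t tileY = (v >> 3) * (bufw * 4);    // each full tile row spans bufw*4 bytes
  const uint8_t byte = src[tileX + rowInTile + tileY + ((u & 31) >> 1)];
  return (u & 1) ? (byte >> 4) : (byte & 0x0F);    // odd u takes the upper nibble
}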
OpArg Jit::JitSafeMem::PrepareMemoryOpArg(ReadType type)
{
  // We may not even need to move into EAX as a temporary.
  bool needTemp = alignMask_ != 0xFFFFFFFF;
#ifdef _M_IX86
  // We always mask on 32 bit in fast memory mode.
  needTemp = needTemp || fast_;
#endif

  if (jit_->gpr.R(raddr_).IsSimpleReg() && !needTemp) {
    jit_->gpr.MapReg(raddr_, true, false);
    xaddr_ = jit_->gpr.RX(raddr_);
  } else {
    jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
    xaddr_ = EAX;
  }

  MemCheckAsm(type);

  if (!fast_) {
    // Is it in physical ram?
    jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
    tooLow_ = jit_->J_CC(CC_B);
    jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_ - (size_ - 1)));
    tooHigh_ = jit_->J_CC(CC_AE);

    // We may need to jump back up here.
    safe_ = jit_->GetCodePtr();
  } else {
#ifdef _M_IX86
    jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
#endif
  }

  // TODO: This could be more optimal, but the common case is that we want xaddr_ not to include offset_.
  // Since we need to align them after add, we add and subtract.
  if (alignMask_ != 0xFFFFFFFF) {
    jit_->ADD(32, R(xaddr_), Imm32(offset_));
    jit_->AND(32, R(xaddr_), Imm32(alignMask_));
    jit_->SUB(32, R(xaddr_), Imm32(offset_));
  }

#ifdef _M_IX86
  return MDisp(xaddr_, (u32)Memory::base + offset_);
#else
  return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
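// The ADD/AND/SUB sequence above is plain alignment arithmetic: align the
// full effective address while keeping xaddr_ free of offset_, since the
// returned OpArg re-applies offset_ as a displacement. A minimal standalone
// sketch with illustrative names (not emitter code):
static inline uint32_t AlignAddrKeepingOffset(uint32_t xaddr, uint32_t offset, uint32_t alignMask) {
  uint32_t addr = xaddr + offset;  // include the displacement the OpArg will add back
  addr &= alignMask;               // e.g. ~3 forces 4-byte alignment of the final address
  return addr - offset;            // strip the offset again so xaddr_ stays offset-free
}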
// Zero cache line.
void JitILBase::dcbz(UGeckoInstruction inst)
{
  FALLBACK_IF(true);

  // TODO!
#if 0
  if (SConfig::GetInstance().bJITOff || SConfig::GetInstance().bJITLoadStoreOff)
  {
    Default(inst);
    return;
  }
  INSTRUCTION_START;
  MOV(32, R(RSCRATCH), gpr.R(inst.RB));
  if (inst.RA)
    ADD(32, R(RSCRATCH), gpr.R(inst.RA));
  AND(32, R(RSCRATCH), Imm32(~31));
  PXOR(XMM0, R(XMM0));
  MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0);
  MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0);
#endif
}
OpArg Jit::JitSafeMem::PrepareMemoryOpArg()
{
  // We may not even need to move into EAX as a temporary.
  // TODO: Except on x86 in fastmem mode.
  if (jit_->gpr.R(raddr_).IsSimpleReg()) {
    jit_->gpr.BindToRegister(raddr_, true, false);
    xaddr_ = jit_->gpr.RX(raddr_);
  } else {
    jit_->MOV(32, R(EAX), jit_->gpr.R(raddr_));
    xaddr_ = EAX;
  }

  if (!g_Config.bFastMemory) {
    // Is it in physical ram?
    jit_->CMP(32, R(xaddr_), Imm32(PSP_GetKernelMemoryBase() - offset_));
    tooLow_ = jit_->J_CC(CC_L);
    jit_->CMP(32, R(xaddr_), Imm32(PSP_GetUserMemoryEnd() - offset_));
    tooHigh_ = jit_->J_CC(CC_GE);

    // We may need to jump back up here.
    safe_ = jit_->GetCodePtr();
  } else {
#ifdef _M_IX86
    // Need to modify it, too bad.
    if (xaddr_ != EAX)
      jit_->MOV(32, R(EAX), R(xaddr_));
    jit_->AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
    xaddr_ = EAX;
#endif
  }

#ifdef _M_IX86
  return MDisp(xaddr_, (u32)Memory::base + offset_);
#else
  return MComplex(RBX, xaddr_, SCALE_1, offset_);
#endif
}
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset)
{
  if (jit_->gpr.IsImmediate(raddr_)) {
    u32 addr = jit_->gpr.GetImmediate32(raddr_) + offset_ + suboffset;
#ifdef _M_IX86
    return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
    return MDisp(RBX, addr);
#endif
  }

#ifdef _M_IX86
  return MDisp(xaddr_, (u32)Memory::base + offset_ + suboffset);
#else
  return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
std::shared_ptr<ComPWA::FunctionTree>
FormFactorDecorator::createFunctionTree(const ParameterList &DataSample,
                                        unsigned int pos,
                                        const std::string &suffix) const {

  // size_t sampleSize = DataSample.mDoubleValue(pos)->values().size();
  size_t sampleSize = DataSample.mDoubleValue(0)->values().size();

  std::string NodeName =
      "BreitWignerWithProductionFormFactor(" + Name + ")" + suffix;

  auto tr = std::make_shared<FunctionTree>(
      NodeName, MComplex("", sampleSize),
      std::make_shared<MultAll>(ParType::MCOMPLEX));

  std::string ffNodeName = "ProductionFormFactor(" + Name + ")" + suffix;
  auto ffTree = std::make_shared<FunctionTree>(
      ffNodeName, MDouble("", sampleSize),
      std::make_shared<FormFactorStrategy>());
  // Add L and FFType as double value leaves, since there is no int leaf.
  ffTree->createLeaf("OrbitalAngularMomentum", (double)L, ffNodeName);
  ffTree->createLeaf("MesonRadius", MesonRadius, ffNodeName);
  ffTree->createLeaf("FormFactorType", (double)FFType, ffNodeName);
  ffTree->createLeaf("MassA", Daughter1Mass, ffNodeName);
  ffTree->createLeaf("MassB", Daughter2Mass, ffNodeName);
  ffTree->createLeaf("Data_mSq[" + std::to_string(pos) + "]",
                     DataSample.mDoubleValue(pos), ffNodeName);
  ffTree->parameter();

  tr->insertTree(ffTree, NodeName);

  std::shared_ptr<ComPWA::FunctionTree> breitWignerTree =
      UndecoratedBreitWigner->createFunctionTree(DataSample, pos, suffix);
  breitWignerTree->parameter();

  tr->insertTree(breitWignerTree, NodeName);

  if (!tr->sanityCheck())
    throw std::runtime_error("FormFactorDecorator::createFunctionTree() | "
                             "Tree didn't pass sanity check!");

  return tr;
}
OpArg Jit::JitSafeMem::NextFastAddress(int suboffset)
{
  if (jit_->gpr.IsImm(raddr_)) {
    u32 addr = (jit_->gpr.GetImm(raddr_) + offset_ + suboffset) & alignMask_;
#ifdef _M_IX86
    return M(Memory::base + (addr & Memory::MEMVIEW32_MASK));
#else
    return MDisp(RBX, addr);
#endif
  }

  _dbg_assert_msg_(JIT, (suboffset & alignMask_) == suboffset, "suboffset must be aligned");

#ifdef _M_IX86
  return MDisp(xaddr_, (u32)Memory::base + offset_ + suboffset);
#else
  return MComplex(RBX, xaddr_, SCALE_1, offset_ + suboffset);
#endif
}
TEST_F(x64EmitterTest, PUSH_MComplex)
{
  emitter->PUSH(64, MComplex(RAX, RBX, SCALE_2, 4));
  ExpectDisassembly("push qword ptr ds:[rax+rbx*2+4]");
}
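// A hypothetical companion test (not from the original suite) exercising
// MComplex with a larger scale and a negative displacement; it assumes the
// fixture's disassembler prints such operands in the same style as above.
TEST_F(x64EmitterTest, MOV_MComplex)
{
  emitter->MOV(64, R(RAX), MComplex(RCX, RDX, SCALE_8, -8));
  ExpectDisassembly("mov rax, qword ptr ds:[rcx+rdx*8-8]");
}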
bool SamplerJitCache::Jit_GetTexDataSwizzled(const SamplerID &id, int bitsPerTexel) {
  if (bitsPerTexel == 4) {
    // Specialized implementation.
    return Jit_GetTexDataSwizzled4();
  }

  LEA(32, tempReg1, MScaled(vReg, SCALE_4, 0));
  AND(32, R(tempReg1), Imm8(31));
  AND(32, R(vReg), Imm8(~7));

  MOV(32, R(tempReg2), R(uReg));
  MOV(32, R(resultReg), R(uReg));
  switch (bitsPerTexel) {
  case 32:
    SHR(32, R(resultReg), Imm8(2));
    break;
  case 16:
    SHR(32, R(vReg), Imm8(1));
    SHR(32, R(tempReg2), Imm8(1));
    SHR(32, R(resultReg), Imm8(3));
    break;
  case 8:
    SHR(32, R(vReg), Imm8(2));
    SHR(32, R(tempReg2), Imm8(2));
    SHR(32, R(resultReg), Imm8(4));
    break;
  default:
    return false;
  }
  AND(32, R(tempReg2), Imm8(3));
  SHL(32, R(resultReg), Imm8(5));
  ADD(32, R(tempReg1), R(tempReg2));
  ADD(32, R(tempReg1), R(resultReg));

  // We may clobber srcReg in the MUL, so let's grab it now.
  LEA(64, tempReg1, MComplex(srcReg, tempReg1, SCALE_4, 0));

  LEA(32, EAX, MScaled(bufwReg, SCALE_4, 0));
  MUL(32, R(vReg));

  switch (bitsPerTexel) {
  case 32:
    MOV(bitsPerTexel, R(resultReg), MRegSum(tempReg1, EAX));
    break;
  case 16:
    AND(32, R(uReg), Imm8(1));
    // Multiply by two by just adding twice.
    ADD(32, R(EAX), R(uReg));
    ADD(32, R(EAX), R(uReg));
    MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
    break;
  case 8:
    AND(32, R(uReg), Imm8(3));
    ADD(32, R(EAX), R(uReg));
    MOVZX(32, bitsPerTexel, resultReg, MRegSum(tempReg1, EAX));
    break;
  default:
    return false;
  }

  return true;
}
void Jit64::lfd(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITLoadStoreFloatingOff);
  FALLBACK_IF(js.memcheck || !inst.RA);

  int d = inst.RD;
  int a = inst.RA;
  s32 offset = (s32)(s16)inst.SIMM_16;
  gpr.FlushLockX(ABI_PARAM1);
  gpr.Lock(a);
  MOV(32, R(ABI_PARAM1), gpr.R(a));
  // TODO - optimize. This has to load the previous value - upper double should stay unmodified.
  fpr.Lock(d);
  fpr.BindToRegister(d, true);
  X64Reg xd = fpr.RX(d);

  if (cpu_info.bSSSE3)
  {
#if _M_X86_64
    MOVQ_xmm(XMM0, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
#else
    AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
    MOVQ_xmm(XMM0, MDisp(ABI_PARAM1, (u32)Memory::base + offset));
#endif
    PSHUFB(XMM0, M((void *)bswapShuffle1x8Dupe));
    MOVSD(xd, R(XMM0));
  }
  else
  {
#if _M_X86_64
    LoadAndSwap(64, EAX, MComplex(RBX, ABI_PARAM1, SCALE_1, offset));
    MOV(64, M(&temp64), R(EAX));

    MEMCHECK_START

    MOVSD(XMM0, M(&temp64));
    MOVSD(xd, R(XMM0));

    MEMCHECK_END
#else
    AND(32, R(ABI_PARAM1), Imm32(Memory::MEMVIEW32_MASK));
    MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset));
    BSWAP(32, EAX);
    MOV(32, M((void*)((u8 *)&temp64 + 4)), R(EAX));

    MEMCHECK_START

    MOV(32, R(EAX), MDisp(ABI_PARAM1, (u32)Memory::base + offset + 4));
    BSWAP(32, EAX);
    MOV(32, M(&temp64), R(EAX));
    MOVSD(XMM0, M(&temp64));
    MOVSD(xd, R(XMM0));

    MEMCHECK_END
#endif
  }

  gpr.UnlockAll();
  gpr.UnlockAllX();
  fpr.UnlockAll();
}
void Jit64::stfd(UGeckoInstruction inst)
{
  INSTRUCTION_START
  JITDISABLE(bJITLoadStoreFloatingOff);
  FALLBACK_IF(js.memcheck || !inst.RA);

  int s = inst.RS;
  int a = inst.RA;

  u32 mem_mask = Memory::ADDR_MASK_HW_ACCESS;
  if (Core::g_CoreStartupParameter.bMMU || Core::g_CoreStartupParameter.bTLBHack)
  {
    mem_mask |= Memory::ADDR_MASK_MEM1;
  }
#ifdef ENABLE_MEM_CHECK
  if (Core::g_CoreStartupParameter.bEnableDebugging)
  {
    mem_mask |= Memory::EXRAM_MASK;
  }
#endif

  gpr.FlushLockX(ABI_PARAM1);
  gpr.Lock(a);
  fpr.Lock(s);
  gpr.BindToRegister(a, true, false);

  s32 offset = (s32)(s16)inst.SIMM_16;
  LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
  TEST(32, R(ABI_PARAM1), Imm32(mem_mask));
  FixupBranch safe = J_CC(CC_NZ);

  // Fast routine
  if (cpu_info.bSSSE3)
  {
    MOVAPD(XMM0, fpr.R(s));
    PSHUFB(XMM0, M((void*)bswapShuffle1x8));
#if _M_X86_64
    MOVQ_xmm(MComplex(RBX, ABI_PARAM1, SCALE_1, 0), XMM0);
#else
    AND(32, R(ECX), Imm32(Memory::MEMVIEW32_MASK));
    MOVQ_xmm(MDisp(ABI_PARAM1, (u32)Memory::base), XMM0);
#endif
  }
  else
  {
    MOVAPD(XMM0, fpr.R(s));
    MOVD_xmm(R(EAX), XMM0);
    UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4);

    PSRLQ(XMM0, 32);
    MOVD_xmm(R(EAX), XMM0);
    UnsafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0);
  }
  FixupBranch exit = J(true);

  SetJumpTarget(safe);

  // Safe but slow routine
  MOVAPD(XMM0, fpr.R(s));
  PSRLQ(XMM0, 32);
  MOVD_xmm(R(EAX), XMM0);
  SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 0, RegistersInUse() | (1 << (16 + XMM0)));

  MOVAPD(XMM0, fpr.R(s));
  MOVD_xmm(R(EAX), XMM0);
  LEA(32, ABI_PARAM1, MDisp(gpr.R(a).GetSimpleReg(), offset));
  SafeWriteRegToReg(EAX, ABI_PARAM1, 32, 4, RegistersInUse());

  SetJumpTarget(exit);

  gpr.UnlockAll();
  gpr.UnlockAllX();
  fpr.UnlockAll();
}
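// The TEST/J_CC pair in stfd implements a simple dispatch predicate: any
// address with a mem_mask bit set takes the safe (slow) store path. A trivial
// scalar sketch of that check, with an illustrative name:
static inline bool NeedsSlowWrite(uint32_t addr, uint32_t mem_mask) {
  // Mirrors TEST(32, R(ABI_PARAM1), Imm32(mem_mask)) followed by J_CC(CC_NZ).
  return (addr & mem_mask) != 0;
}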