//TODO: add optimized cases void Jit64::ps_maddXX(UGeckoInstruction inst) { INSTRUCTION_START JITDISABLE(bJITPairedOff); FALLBACK_IF(inst.Rc); int a = inst.FA; int b = inst.FB; int c = inst.FC; int d = inst.FD; fpr.Lock(a,b,c,d); MOVAPD(XMM0, fpr.R(a)); switch (inst.SUBOP5) { case 14: //madds0 MOVDDUP(XMM1, fpr.R(c)); MULPD(XMM0, R(XMM1)); ADDPD(XMM0, fpr.R(b)); break; case 15: //madds1 MOVAPD(XMM1, fpr.R(c)); SHUFPD(XMM1, R(XMM1), 3); // copy higher to lower MULPD(XMM0, R(XMM1)); ADDPD(XMM0, fpr.R(b)); break; case 28: //msub MULPD(XMM0, fpr.R(c)); SUBPD(XMM0, fpr.R(b)); break; case 29: //madd MULPD(XMM0, fpr.R(c)); ADDPD(XMM0, fpr.R(b)); break; case 30: //nmsub MULPD(XMM0, fpr.R(c)); SUBPD(XMM0, fpr.R(b)); PXOR(XMM0, M((void*)&psSignBits)); break; case 31: //nmadd MULPD(XMM0, fpr.R(c)); ADDPD(XMM0, fpr.R(b)); PXOR(XMM0, M((void*)&psSignBits)); break; default: _assert_msg_(DYNA_REC, 0, "ps_maddXX WTF!!!"); //FallBackToInterpreter(inst); //fpr.UnlockAll(); return; } fpr.BindToRegister(d, false); MOVAPD(fpr.RX(d), Gen::R(XMM0)); ForceSinglePrecisionP(fpr.RX(d)); fpr.UnlockAll(); }
// Emits code for the scalar multiply-add family. SUBOP5 selects the variant:
//   28 msub:  d = a * c - b
//   29 madd:  d = a * c + b
//   30 nmsub: d = -(a * c - b)
//   31 nmadd: d = -(a * c + b)
// Any other SUBOP5 emits no arithmetic, so d receives a unchanged.
// Opcode 59 is the single-precision form: the result goes through
// ForceSinglePrecisionS and is duplicated into both lanes of d.
void Jit64::fmaddXX(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	FALLBACK_IF(inst.Rc);
	// Only the interpreter has "proper" support for (some) FP flags
	FALLBACK_IF(inst.SUBOP5 == 29 && Core::g_CoreStartupParameter.bEnableFPRF);

	const bool roundToSingle = (inst.OPCD == 59);
	const int srcA = inst.FA;
	const int srcB = inst.FB;
	const int srcC = inst.FC;
	const int dst = inst.FD;

	// Decompose the sub-opcode into the three decisions that shape the code.
	const int subop = inst.SUBOP5;
	const bool subtracts = (subop == 28 || subop == 30);
	const bool adds = (subop == 29 || subop == 31);
	const bool negates = (subop == 30 || subop == 31);

	fpr.Lock(srcA, srcB, srcC, dst);
	MOVSD(XMM0, fpr.R(srcA));

	if (subtracts || adds)
	{
		MULSD(XMM0, fpr.R(srcC));
		if (subtracts)
		{
			SUBSD(XMM0, fpr.R(srcB));
		}
		else
		{
			ADDSD(XMM0, fpr.R(srcB));
		}
		if (negates)
		{
			// Flip the sign by XORing with the sign-bit mask.
			PXOR(XMM0, M((void*)&psSignBits2));
		}
	}

	fpr.BindToRegister(dst, false);
	//YES it is necessary to dupe the result :(
	//TODO : analysis - does the top reg get used? If so, dupe, if not, don't.
	if (!roundToSingle)
	{
		MOVSD(fpr.RX(dst), R(XMM0));
	}
	else
	{
		ForceSinglePrecisionS(XMM0);
		MOVDDUP(fpr.RX(dst), R(XMM0));
	}
	// SMB checks flags after this op. Let's lie.
	//AND(32, M(&PowerPC::ppcState.fpscr), Imm32(~((0x80000000 >> 19) | (0x80000000 >> 15))));
	//OR(32, M(&PowerPC::ppcState.fpscr), Imm32((0x80000000 >> 16)));
	fpr.UnlockAll();
}
// Emits code for the paired-single sign operations on FB, writing FD.
// SUBOP10 selects the operation: 40 = neg (XOR with the sign mask),
// 136 = nabs (OR with the sign mask), 264 = abs (AND with the abs mask).
// Any other SUBOP10 leaves d as a plain copy of b.
void Jit64::ps_sign(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITPairedOff);
	FALLBACK_IF(inst.Rc);

	const int dst = inst.FD;
	const int src = inst.FB;
	const bool inPlace = (dst == src);

	fpr.Lock(dst, src);

	// When operating in place the current value must be loaded; otherwise
	// the destination is overwritten by a copy of the source.
	fpr.BindToRegister(dst, inPlace);
	if (!inPlace)
	{
		MOVAPD(fpr.RX(dst), fpr.R(src));
	}

	const int subop = inst.SUBOP10;
	if (subop == 40) //neg
	{
		PXOR(fpr.RX(dst), M((void*)&psSignBits));
	}
	else if (subop == 136) //nabs
	{
		POR(fpr.RX(dst), M((void*)&psSignBits));
	}
	else if (subop == 264) //abs
	{
		PAND(fpr.RX(dst), M((void*)&psAbsMask));
	}

	fpr.UnlockAll();
}
// Emits code for ps_sel: per lane, d = (a >= 0) ? c : b.
// PowerPC selects b when a is less than zero OR a NaN, and treats -0 as +0
// (so -0 selects c).
void Jit64::ps_sel(UGeckoInstruction inst)
{
	// we can't use (V)BLENDVPD here because it just looks at the sign bit
	// but we need -0 = +0
	INSTRUCTION_START
	JITDISABLE(bJITPairedOff);
	FALLBACK_IF(inst.Rc);

	int d = inst.FD;
	int a = inst.FA;
	int b = inst.FB;
	int c = inst.FC;
	fpr.Lock(a, b, c, d);

	MOVAPD(XMM1, fpr.R(a));
	PXOR(XMM0, R(XMM0));
	// XMM0 = !(0 <= a) ? all 1s : all 0s
	// The "not less-or-equal" predicate is also true for unordered operands,
	// so the mask is set for a < 0 *and* for NaN. A plain (a < 0) compare is
	// false for NaN and would wrongly pick c in that case.
	CMPPD(XMM0, R(XMM1), NLE);

	// Blend by hand: d = (mask & b) | (~mask & c)
	MOVAPD(XMM1, R(XMM0));
	PAND(XMM0, fpr.R(b));
	PANDN(XMM1, fpr.R(c));
	POR(XMM0, R(XMM1));

	fpr.BindToRegister(d, false); // d is fully overwritten, no need to load it
	MOVAPD(fpr.RX(d), R(XMM0));
	fpr.UnlockAll();
}
// Emits code for the scalar sign operations on FB, writing the low lane of FD.
// SUBOP10 selects the operation: 40 = fneg (XOR with the sign mask),
// 264 = fabs (AND with the abs mask), 136 = fnabs (OR with the sign mask).
// The value is processed in XMM0 and then moved into d.
void Jit64::fsign(UGeckoInstruction inst)
{
	INSTRUCTION_START
	JITDISABLE(bJITFloatingPointOff);
	FALLBACK_IF(inst.Rc);

	const int dst = inst.FD;
	const int src = inst.FB;

	fpr.Lock(src, dst);
	// d is bound with load + dirty; only its low lane is written below.
	fpr.BindToRegister(dst, true, true);
	MOVSD(XMM0, fpr.R(src));

	const int subop = inst.SUBOP10;
	if (subop == 40) // fnegx
	{
		PXOR(XMM0, M((void*)&psSignBits2));
	}
	else if (subop == 264) // fabsx
	{
		PAND(XMM0, M((void*)&psAbsMask2));
	}
	else if (subop == 136) // fnabs
	{
		POR(XMM0, M((void*)&psSignBits2));
	}
	else
	{
		PanicAlert("fsign bleh");
	}

	MOVSD(fpr.R(dst), XMM0);
	fpr.UnlockAll();
}
// Zero cache line. void JitILBase::dcbz(UGeckoInstruction inst) { FALLBACK_IF(true); // TODO! #if 0 if (Core::g_CoreStartupParameter.bJITOff || Core::g_CoreStartupParameter.bJITLoadStoreOff) { Default(inst); return; } INSTRUCTION_START; MOV(32, R(EAX), gpr.R(inst.RB)); if (inst.RA) ADD(32, R(EAX), gpr.R(inst.RA)); AND(32, R(EAX), Imm32(~31)); PXOR(XMM0, R(XMM0)); #if _M_X86_64 MOVAPS(MComplex(EBX, EAX, SCALE_1, 0), XMM0); MOVAPS(MComplex(EBX, EAX, SCALE_1, 16), XMM0); #else AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK)); MOVAPS(MDisp(EAX, (u32)Memory::base), XMM0); MOVAPS(MDisp(EAX, (u32)Memory::base + 16), XMM0); #endif #endif }
// Zero cache line. void JitILBase::dcbz(UGeckoInstruction inst) { FALLBACK_IF(true); // TODO! #if 0 if (SConfig::GetInstance().bJITOff || SConfig::GetInstance().bJITLoadStoreOff) { Default(inst); return; } INSTRUCTION_START; MOV(32, R(RSCRATCH), gpr.R(inst.RB)); if (inst.RA) ADD(32, R(RSCRATCH), gpr.R(inst.RA)); AND(32, R(RSCRATCH), Imm32(~31)); PXOR(XMM0, R(XMM0)); MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 0), XMM0); MOVAPS(MComplex(RMEM, RSCRATCH, SCALE_1, 16), XMM0); #endif }
// Compiles the two-operand FPU instructions, dispatched on the low six bits
// of the opcode. Natively handled: sqrt (4), abs (5), mov (6), neg (7),
// trunc.w.s (13) and cvt.s.w (32). Everything else (including round.w.s,
// ceil.w.s, floor.w.s and cvt.w.s) takes the DISABLE path.
void Jit::Comp_FPU2op(MIPSOpcode op) {
	CONDITIONAL_DISABLE;

	const int src = _FS;
	const int dst = _FD;
	const bool inPlace = (dst == src);

	// Shared prologue for the simple register-to-register operations:
	// lock both registers and map the destination, loading its old value
	// only when it doubles as the source.
	auto lockAndMapDest = [&]() {
		fpr.SpillLock(dst, src);
		fpr.MapReg(dst, inPlace, true);
	};

	switch (op & 0x3f) {
	case 4:	// F(fd) = sqrtf(F(fs)) -- sqrt
		// this probably works, just badly tested
		lockAndMapDest();
		SQRTSS(fpr.RX(dst), fpr.R(src));
		break;

	case 5:	// F(fd) = fabsf(F(fs)) -- abs
		lockAndMapDest();
		MOVSS(fpr.RX(dst), fpr.R(src));
		PAND(fpr.RX(dst), M(ssNoSignMask));
		break;

	case 6:	// F(fd) = F(fs) -- mov
		// Moving a register onto itself emits nothing.
		if (!inPlace) {
			lockAndMapDest();
			MOVSS(fpr.RX(dst), fpr.R(src));
		}
		break;

	case 7:	// F(fd) = -F(fs) -- neg
		lockAndMapDest();
		MOVSS(fpr.RX(dst), fpr.R(src));
		PXOR(fpr.RX(dst), M(ssSignBits2));
		break;

	case 13: {	// FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)) -- trunc.w.s
		fpr.SpillLock(src, dst);
		fpr.StoreFromRegister(dst);
		CVTTSS2SI(EAX, fpr.R(src));

		// Did we get an indefinite integer value?
		CMP(32, R(EAX), Imm32(0x80000000));
		FixupBranch converted = J_CC(CC_NE);

		MOVSS(XMM0, fpr.R(src));
		XORPS(XMM1, R(XMM1));
		CMPSS(XMM0, R(XMM1), CMP_LT);

		// At this point, -inf = 0xffffffff, inf/nan = 0x00000000.
		// We want -inf to be 0x80000000 inf/nan to be 0x7fffffff, so we flip those bits.
		MOVD_xmm(R(EAX), XMM0);
		XOR(32, R(EAX), Imm32(0x7fffffff));

		SetJumpTarget(converted);
		MOV(32, fpr.R(dst), R(EAX));
		break;
	}

	case 32:	// F(fd) = (float)FsI(fs) -- cvt.s.w
		// Store to memory so we can read it as an integer value.
		fpr.StoreFromRegister(src);
		CVTSI2SS(XMM0, fpr.R(src));
		MOVSS(fpr.R(dst), XMM0);
		break;

	// Not compiled natively:
	//   12 round.w.s  FsI(fd) = (int)floorf(F(fs)+0.5f)
	//   14 ceil.w.s   FsI(fd) = (int)ceilf (F(fs))
	//   15 floor.w.s  FsI(fd) = (int)floorf(F(fs))
	//   36 cvt.w.s    FsI(fd) = (int)       F(fs)
	default:
		DISABLE;
		return;
	}

	fpr.ReleaseSpillLocks();
}
// Compiles the two-operand FPU instructions, dispatched on the low six bits
// of the opcode. Natively handled: abs (5), mov (6), neg (7) and
// trunc.w.s (13). Everything else is handed to Comp_Generic.
void Jit::Comp_FPU2op(u32 op)
{
	CONDITIONAL_DISABLE;

	const int src = _FS;
	const int dst = _FD;
	// The destination's old value only needs loading when it is also the source.
	const bool inPlace = (dst == src);

	switch (op & 0x3f)
	{
	case 5:	// F(fd) = fabsf(F(fs)) -- abs
		fpr.Lock(dst, src);
		fpr.BindToRegister(dst, inPlace, true);
		MOVSS(fpr.RX(dst), fpr.R(src));
		PAND(fpr.RX(dst), M((void *)ssNoSignMask));
		fpr.UnlockAll();
		break;

	case 6:	// F(fd) = F(fs) -- mov
		// Moving a register onto itself emits nothing.
		if (!inPlace)
		{
			fpr.Lock(dst, src);
			fpr.BindToRegister(dst, inPlace, true);
			MOVSS(fpr.RX(dst), fpr.R(src));
			fpr.UnlockAll();
		}
		break;

	case 7:	// F(fd) = -F(fs) -- neg
		fpr.Lock(dst, src);
		fpr.BindToRegister(dst, inPlace, true);
		MOVSS(fpr.RX(dst), fpr.R(src));
		PXOR(fpr.RX(dst), M((void *)ssSignBits2));
		fpr.UnlockAll();
		break;

	case 13:	// FsI(fd) = F(fs)>=0 ? (int)floorf(F(fs)) : (int)ceilf(F(fs)) -- trunc.w.s
		fpr.Lock(src, dst);
		// Flush fd so the integer result can be written straight to its home location.
		fpr.StoreFromRegister(dst);
		CVTTSS2SI(EAX, fpr.R(src));
		MOV(32, fpr.R(dst), R(EAX));
		fpr.UnlockAll();
		break;

	// Not compiled natively:
	//    4 sqrt       F(fd) = sqrtf(F(fs))
	//        A native path exists but is switched off
	//        ("this probably works, just badly tested"):
	//          fpr.Lock(fd, fs);
	//          fpr.BindToRegister(fd, fd == fs, true);
	//          SQRTSS(fpr.RX(fd), fpr.R(fs));
	//          fpr.UnlockAll();
	//   12 round.w.s  FsI(fd) = (int)floorf(F(fs)+0.5f)
	//   14 ceil.w.s   FsI(fd) = (int)ceilf (F(fs))
	//   15 floor.w.s  FsI(fd) = (int)floorf(F(fs))
	//   32 cvt.s.w    F(fd)   = (float)FsI(fs)
	//   36 cvt.w.s    FsI(fd) = (int)  F(fs)
	default:
		Comp_Generic(op);
		return;
	}
}
// Generates the bilinear ("linear") sampler for the given sampler id.
//
// Two pieces of code are written back to back:
//   1. a nearest sampler (via Jit_ReadTextureFormat), terminated by RET, and
//   2. the linear entry point, which CALLs the nearest sampler four times and
//      blends the four results using frac_u and frac_v.
//
// Returns a pointer to the linear entry point, or nullptr when
// Jit_ReadTextureFormat fails (in which case the code pointer is rewound to
// where the nearest sampler started).
LinearFunc SamplerJitCache::CompileLinear(const SamplerID &id) {
	_assert_msg_(G3D, id.linear, "Linear should be set on sampler id");
	BeginWrite();

	// We'll first write the nearest sampler, which we will CALL.
	// This may differ slightly based on the "linear" flag.
	const u8 *nearest = AlignCode16();

	if (!Jit_ReadTextureFormat(id)) {
		// Discard whatever was emitted so far by rewinding to the start.
		EndWrite();
		SetCodePtr(const_cast<u8 *>(nearest));
		return nullptr;
	}

	RET();

	// Now the actual linear func, which is exposed externally.
	const u8 *start = AlignCode16();

	// NOTE: This doesn't use the general register mapping.
	// POSIX: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, arg5=src, arg6=bufw, stack+8=level
	// Win64: arg1=uptr, arg2=vptr, arg3=frac_u, arg4=frac_v, stack+40=src, stack+48=bufw, stack+56=level
	//
	// We map these to nearest CALLs, with order: u, v, src, bufw, level

	// Let's start by saving a bunch of registers.
	// Six 8-byte pushes = 48 bytes; the epilogue pops them in reverse order.
	PUSH(R15);
	PUSH(R14);
	PUSH(R13);
	PUSH(R12);
	// Won't need frac_u/frac_v for a while.
	PUSH(arg4Reg);
	PUSH(arg3Reg);
	// Extra space to restore alignment and save resultReg for lerp.
	// The four nearest results are stored at RSP+0, +4, +8 and +12.
	// TODO: Maybe use XMMs instead?
	SUB(64, R(RSP), Imm8(24));

	// Keep uptr and vptr in R12 and R13 across the nearest CALLs.
	MOV(64, R(R12), R(arg1Reg));
	MOV(64, R(R13), R(arg2Reg));
#ifdef _WIN32
	// First arg now starts at 24 (extra space) + 48 (pushed stack) + 8 (ret address) + 32 (shadow space)
	const int argOffset = 24 + 48 + 8 + 32;
	MOV(64, R(R14), MDisp(RSP, argOffset));
	MOV(32, R(R15), MDisp(RSP, argOffset + 8));
	// level is at argOffset + 16.
#else
	MOV(64, R(R14), R(arg5Reg));
	MOV(32, R(R15), R(arg6Reg));
	// level is at 24 + 48 + 8.
#endif

	// Early exit on !srcPtr.
	// zeroSrc is only assigned (and only resolved in the epilogue) when
	// id.hasInvalidPtr is set.
	FixupBranch zeroSrc;
	if (id.hasInvalidPtr) {
		CMP(PTRBITS, R(R14), Imm8(0));
		FixupBranch nonZeroSrc = J_CC(CC_NZ);
		// Null source: zero RAX and jump straight to the epilogue.
		XOR(32, R(RAX), R(RAX));
		zeroSrc = J(true);
		SetJumpTarget(nonZeroSrc);
	}

	// At this point:
	// R12=uptr, R13=vptr, stack+24=frac_u, stack+32=frac_v, R14=src, R15=bufw, stack+X=level

	// Emits one nearest lookup: loads the 32-bit u and v at byte offset `off`
	// from uptr/vptr, calls the nearest sampler, and stores the 32-bit result
	// at RSP+off.
	auto doNearestCall = [&](int off) {
		MOV(32, R(uReg), MDisp(R12, off));
		MOV(32, R(vReg), MDisp(R13, off));
		MOV(64, R(srcReg), R(R14));
		MOV(32, R(bufwReg), R(R15));
		// Leave level, we just always load from RAM.  Separate CLUTs is uncommon.

		CALL(nearest);
		MOV(32, MDisp(RSP, off), R(resultReg));
	};

	doNearestCall(0);
	doNearestCall(4);
	doNearestCall(8);
	doNearestCall(12);

	// Convert TL, TR, BL, BR to floats for easier blending.
	if (!cpu_info.bSSE4_1) {
		// Zero register used as the unpack partner in the non-SSE4.1 path below.
		PXOR(XMM0, R(XMM0));
	}

	MOVD_xmm(fpScratchReg1, MDisp(RSP, 0));
	MOVD_xmm(fpScratchReg2, MDisp(RSP, 4));
	MOVD_xmm(fpScratchReg3, MDisp(RSP, 8));
	MOVD_xmm(fpScratchReg4, MDisp(RSP, 12));

	// Widen each of the four bytes of a result to its own 32-bit lane.
	if (cpu_info.bSSE4_1) {
		PMOVZXBD(fpScratchReg1, R(fpScratchReg1));
		PMOVZXBD(fpScratchReg2, R(fpScratchReg2));
		PMOVZXBD(fpScratchReg3, R(fpScratchReg3));
		PMOVZXBD(fpScratchReg4, R(fpScratchReg4));
	} else {
		// Same widening in two steps: bytes -> words -> dwords, interleaving with zero.
		PUNPCKLBW(fpScratchReg1, R(XMM0));
		PUNPCKLBW(fpScratchReg2, R(XMM0));
		PUNPCKLBW(fpScratchReg3, R(XMM0));
		PUNPCKLBW(fpScratchReg4, R(XMM0));
		PUNPCKLWD(fpScratchReg1, R(XMM0));
		PUNPCKLWD(fpScratchReg2, R(XMM0));
		PUNPCKLWD(fpScratchReg3, R(XMM0));
		PUNPCKLWD(fpScratchReg4, R(XMM0));
	}
	CVTDQ2PS(fpScratchReg1, R(fpScratchReg1));
	CVTDQ2PS(fpScratchReg2, R(fpScratchReg2));
	CVTDQ2PS(fpScratchReg3, R(fpScratchReg3));
	CVTDQ2PS(fpScratchReg4, R(fpScratchReg4));

	// Okay, now multiply the R sides by frac_u, and L by (256 - frac_u)...
	// frac_u (saved at RSP+24) is converted to float, broadcast to all lanes
	// and multiplied by the `by256` constant; XMM0 then holds `ones` minus
	// that value (presumably by256 = 1/256 and ones = 1.0 - TODO confirm).
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 24));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	if (RipAccessible(by256)) {
		MULPS(fpScratchReg5, M(by256));  // rip accessible
	} else {
		Crash();  // TODO
	}
	// NOTE(review): M(ones) here, and M(by256)/M(ones) in the frac_v step
	// below, are used without a RipAccessible check - confirm that is intended.
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg2, R(fpScratchReg5));
	MULPS(fpScratchReg3, R(XMM0));
	MULPS(fpScratchReg4, R(fpScratchReg5));

	// Now set top=fpScratchReg1, bottom=fpScratchReg3.
	ADDPS(fpScratchReg1, R(fpScratchReg2));
	ADDPS(fpScratchReg3, R(fpScratchReg4));

	// Next, time for frac_v.
	// Same weighting as above, using frac_v (saved at RSP+32) to blend top and bottom.
	MOVD_xmm(fpScratchReg5, MDisp(RSP, 32));
	CVTDQ2PS(fpScratchReg5, R(fpScratchReg5));
	SHUFPS(fpScratchReg5, R(fpScratchReg5), _MM_SHUFFLE(0, 0, 0, 0));
	MULPS(fpScratchReg5, M(by256));
	MOVAPS(XMM0, M(ones));
	SUBPS(XMM0, R(fpScratchReg5));

	MULPS(fpScratchReg1, R(XMM0));
	MULPS(fpScratchReg3, R(fpScratchReg5));

	// Still at the 255 scale, now we're interpolated.
	ADDPS(fpScratchReg1, R(fpScratchReg3));

	// Time to convert back to a single 32 bit value.
	// float -> int32 lanes, then pack dwords -> words -> bytes.
	CVTPS2DQ(fpScratchReg1, R(fpScratchReg1));
	PACKSSDW(fpScratchReg1, R(fpScratchReg1));
	PACKUSWB(fpScratchReg1, R(fpScratchReg1));
	MOVD_xmm(R(resultReg), fpScratchReg1);

	if (id.hasInvalidPtr) {
		// The null-source early exit lands here, with RAX already zeroed.
		SetJumpTarget(zeroSrc);
	}

	// Epilogue: drop the scratch space and pop in reverse order of the pushes.
	ADD(64, R(RSP), Imm8(24));
	POP(arg3Reg);
	POP(arg4Reg);
	POP(R12);
	POP(R13);
	POP(R14);
	POP(R15);

	RET();

	EndWrite();
	return (LinearFunc)start;
}