// Returns true only if every element of vector v is already mapped into a
// single SIMD register, in order: element i must be in lane i+1 (lane is
// 1-based here; elsewhere lane == 0 means "not in a SIMD group"), all in the
// same xreg, and the trailing lanes of that xreg must be unused.
bool FPURegCache::IsMappedVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	// Make sure the first reg is at least mapped in the right place.
	if (!IsMappedVS(v[0]))
		return false;
	if (vregs[v[0]].lane != 1)
		return false;

	// And make sure the rest are mapped to the same reg in the right positions.
	X64Reg xr = VSX(v);
	for (int i = 1; i < n; ++i) {
		u8 vi = v[i];
		if (!IsMappedVS(vi) || VSX(&vi) != xr)
			return false;
		if (vregs[vi].lane != i + 1)
			return false;
	}

	// Reject if extra regs share the xreg beyond the vector's n elements.
	// TODO: Optimize this case? It happens.
	for (int i = n; i < 4; ++i) {
		if (xregs[xr].mipsRegs[i] != -1) {
			return false;
		}
	}
	return true;
}
void FPURegCache::SimpleRegsV(const u8 *v, VectorSize vsz, int flags) { const int n = GetNumVectorElements(vsz); // TODO: Could be more optimal (in case of Discard or etc.) for (int i = 0; i < n; ++i) { SimpleRegV(v[i], flags); } }
// Applies the current D-prefix (VFPU_CTRL_DPREFIX) to the n elements of v:
// per-element saturation (sat == 1 clamps to [0,1], sat == 3 to [-1,1]) and
// extraction of the per-element write-mask bits (bits 8+i of the prefix).
void ApplyPrefixD(float *v, VectorSize size, bool onlyWriteMask = false) {
	u32 data = currentMIPS->vfpuCtrl[VFPU_CTRL_DPREFIX];
	if (!data)
		return;

	int n = GetNumVectorElements(size);
	bool writeMask[4];
	for (int i = 0; i < n; i++) {
		int mask = (data >> (8 + i)) & 1;
		writeMask[i] = mask ? true : false;
		if (!onlyWriteMask) {
			int sat = (data >> (i * 2)) & 3;
			if (sat == 1) {
				// Saturate to [0, 1]. NaN fails both compares and is left alone.
				if (v[i] > 1.0f) v[i] = 1.0f;
				if (v[i] < 0.0f) v[i] = 0.0f;
			} else if (sat == 3) {
				// Saturate to [-1, 1].
				if (v[i] > 1.0f) v[i] = 1.0f;
				if (v[i] < -1.0f) v[i] = -1.0f;
			}
		}
	}
	// NOTE(review): writeMask is computed but never consumed in this function -
	// presumably the mask was handed off to the write path originally; confirm
	// against the caller before removing it.
	(void)writeMask;
}  // FIX: the original text was missing this closing brace.
// Emits ARM VFP code applying the D-prefix saturation to each unmasked lane,
// using the arithmetic approximations
//   sat1: fabs(x) - fabs(x - 0.5) + 0.5
//   sat3: fabs(x) - fabs(x - 1.0)
// NOTE(review): a sibling revision of this function documents that these
// approximations are not exact clamps and mishandle NaN/Inf - treat as known
// inaccurate.
void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
	_assert_(js.prefixDFlag & ArmJitState::PREFIX_KNOWN);
	if (!js.prefixD)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		// Write-masked lanes are skipped entirely.
		if (js.VfpuWriteMask(i))
			continue;

		int sat = (js.prefixD >> (i * 2)) & 3;
		if (sat == 1) {
			// clamped = fabs(x) - fabs(x-0.5f) + 0.5f; // [ 0, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);
			MOVI2F(S0, 0.5, R0);
			VABS(S1, fpr.V(vregs[i]));      // S1 = fabs(x)
			VSUB(S2, fpr.V(vregs[i]), S0);  // S2 = fabs(x-0.5f) {VABD}
			VABS(S2, S2);
			VSUB(fpr.V(vregs[i]), S1, S2);  // v[i] = S1 - S2 + 0.5f
			VADD(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);
		} else if (sat == 3) {
			// clamped = fabs(x) - fabs(x-1.0f); // [-1, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);
			MOVI2F(S0, 1.0, R0);
			VABS(S1, fpr.V(vregs[i]));      // S1 = fabs(x)
			VSUB(S2, fpr.V(vregs[i]), S0);  // S2 = fabs(x-1.0f) {VABD}
			VABS(S2, S2);
			VSUB(fpr.V(vregs[i]), S1, S2);  // v[i] = S1 - S2
		}
	}
}
// Attempts to map the whole vector v into one SIMD (x64) register.
// Returns false only if CanMapVS rejects it; otherwise the elements end up
// mapped (lane-tagged), optionally dirty, and the function returns true.
bool FPURegCache::TryMapRegsVS(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);
	if (!CanMapVS(v, vsz)) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		// Already mapped then, perfect. Just mark dirty.
		if ((flags & MAP_DIRTY) != 0)
			xregs[VSX(v)].dirty = true;
		return true;
	}

	// At this point, some or all are in single regs or memory, and they're not locked there.

	if (n == 1) {
		// Single is easy, just map normally but track as a SIMD reg.
		// This way V/VS can warn about improper usage properly.
		MapRegV(v[0], flags);
		vregs[v[0]].lane = 1;
		if ((flags & MAP_DIRTY) != 0)
			xregs[VSX(v)].dirty = true;
		Invariant();
		return true;
	}

	X64Reg xr;
	if ((flags & MAP_NOINIT) != MAP_NOINIT) {
		// Current values are needed: gather them into one register.
		xr = LoadRegsVS(v, n);
	} else {
		// Contents will be overwritten - any free register will do.
		xr = GetFreeXReg();
	}

	// Victory, now let's clean up everything.
	OpArg newloc = Gen::R(xr);
	bool dirty = (flags & MAP_DIRTY) != 0;
	for (int i = 0; i < n; ++i) {
		MIPSCachedFPReg &vr = vregs[v[i]];
		if (vr.away) {
			// Clear the xreg it was in before.
			X64Reg oldXReg = vr.location.GetSimpleReg();
			xregs[oldXReg].mipsReg = -1;
			if (xregs[oldXReg].dirty) {
				// Inherit the "dirtiness" (ultimately set below for all regs.)
				dirty = true;
				xregs[oldXReg].dirty = false;
			}
		}
		// +32 matches the offset used elsewhere when mapping V regs as MIPS regs.
		xregs[xr].mipsRegs[i] = v[i] + 32;
		vr.location = newloc;
		vr.lane = i + 1;  // 1-based lane; 0 means "not part of a SIMD group".
		vr.away = true;
	}
	xregs[xr].dirty = dirty;

	Invariant();
	return true;
}
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) { u8 r[4]; GetVectorRegs(r, sz, vec); SpillLockV(r, sz); for (int i = 0; i < GetNumVectorElements(sz); i++) { MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0); } }
void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) { u8 v[4]; GetVectorRegs(v, sz, vec); SpillLockV(v, sz); for (int i = 0; i < GetNumVectorElements(sz); i++) { BindToRegister(v[i] + 32, (flags & MAP_NOINIT) == 0, (flags & MAP_DIRTY) != 0); } }
// Emits ARM code rewriting vregs in place per the S/T-prefix: per-lane
// swizzle (regnum), absolute value, negation, or a table constant. Modified
// lanes are redirected into temp V regs so the originals aren't clobbered.
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
	// 0xE4 decodes to regnums 0,1,2,3 with no abs/neg/const - the identity.
	if (prefix == 0xE4)
		return;

	int n = GetNumVectorElements(sz);
	u8 origV[4];
	// Constant table; the abs bit selects the upper half (regnum + 4).
	static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};

	for (int i = 0; i < n; i++)
		origV[i] = vregs[i];

	for (int i = 0; i < n; i++) {
		int regnum = (prefix >> (i*2)) & 3;
		int abs = (prefix >> (8+i)) & 1;
		int negate = (prefix >> (16+i)) & 1;
		int constants = (prefix >> (12+i)) & 1;

		// Unchanged, hurray.
		if (!constants && regnum == i && !abs && !negate)
			continue;

		// This puts the value into a temp reg, so we won't write the modified value back.
		vregs[i] = fpr.GetTempV();
		fpr.MapRegV(vregs[i], MAP_NOINIT | MAP_DIRTY);
		if (!constants) {
			// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
			// TODO: But some ops seem to use const 0 instead?
			if (regnum >= n) {
				ERROR_LOG_REPORT(CPU, "Invalid VFPU swizzle: %08x / %d", prefix, sz);
				regnum = 0;
			}

			if (abs) {
				VABS(fpr.V(vregs[i]), fpr.V(origV[regnum]));
			} else {
				VMOV(fpr.V(vregs[i]), fpr.V(origV[regnum]));
			}
		} else {
			// TODO: There is VMOV s, imm on ARM, that can generate some of these constants. Not 1/3 or 1/6 though.
			MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs<<2)], R0);
		}

		// TODO: This can be integrated into the VABS / VMOV above, and also the constants.
		if (negate)
			VNEG(fpr.V(vregs[i]), fpr.V(vregs[i]));

		// TODO: This probably means it will swap out soon, inefficiently...
		fpr.ReleaseSpillLockV(vregs[i]);
	}
}
// Emits ARM code rewriting vregs in place per the S/T-prefix: per-lane
// swizzle (regnum), abs, negate, or a table constant. Modified lanes are
// redirected into spill-locked temp V regs.
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
	// 0xE4 decodes to regnums 0,1,2,3 with no abs/neg/const - the identity.
	if (prefix == 0xE4)
		return;

	int n = GetNumVectorElements(sz);
	u8 origV[4];
	// Constant table; the abs bit selects the upper half (regnum + 4).
	static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};

	for (int i = 0; i < n; i++)
		origV[i] = vregs[i];

	for (int i = 0; i < n; i++) {
		int regnum = (prefix >> (i*2)) & 3;
		int abs = (prefix >> (8+i)) & 1;
		int negate = (prefix >> (16+i)) & 1;
		int constants = (prefix >> (12+i)) & 1;

		// Unchanged, hurray.
		if (!constants && regnum == i && !abs && !negate)
			continue;

		// This puts the value into a temp reg, so we won't write the modified value back.
		vregs[i] = fpr.GetTempV();
		if (!constants) {
			// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
			// TODO: But some ops seem to use const 0 instead?
			// FIX: validate regnum BEFORE using origV[regnum] - previously
			// MapDirtyInV was called with an uninitialized origV entry when the
			// swizzle referenced a lane past the vector's size.
			if (regnum >= n) {
				WARN_LOG(CPU, "JIT: Invalid VFPU swizzle: %08x : %d / %d at PC = %08x (%s)", prefix, regnum, n, js.compilerPC, currentMIPS->DisasmAt(js.compilerPC));
				regnum = 0;
			}
			fpr.MapDirtyInV(vregs[i], origV[regnum]);
			fpr.SpillLockV(vregs[i]);

			if (abs) {
				VABS(fpr.V(vregs[i]), fpr.V(origV[regnum]));
				if (negate)
					VNEG(fpr.V(vregs[i]), fpr.V(vregs[i]));
			} else {
				// Negation can be folded directly into the move.
				if (negate)
					VNEG(fpr.V(vregs[i]), fpr.V(origV[regnum]));
				else
					VMOV(fpr.V(vregs[i]), fpr.V(origV[regnum]));
			}
		} else {
			fpr.MapRegV(vregs[i], MAP_DIRTY | MAP_NOINIT);
			fpr.SpillLockV(vregs[i]);
			// MOVI2F can negate the constant while materializing it.
			MOVI2F(fpr.V(vregs[i]), constantArray[regnum + (abs<<2)], R0, negate);
		}
	}
}
// Fills regs[] with the destination vector's registers, then redirects any
// write-masked lane to a throwaway temp reg so the real reg is untouched.
void Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
	_assert_(js.prefixDFlag & ArmJitState::PREFIX_KNOWN);

	GetVectorRegs(regs, sz, vectorReg);
	if (js.prefixD == 0)
		return;

	const int count = GetNumVectorElements(sz);
	for (int lane = 0; lane < count; ++lane) {
		if (!js.VfpuWriteMask(lane))
			continue;
		// Hopefully this is rare, we'll just write it into a reg we drop.
		regs[lane] = fpr.GetTempV();
	}
}
void FPURegCache::MapRegsVS(const u8 *r, VectorSize vsz, int flags) { const int n = GetNumVectorElements(vsz); _dbg_assert_msg_(JIT, jo_->enableVFPUSIMD, "Should not map simd regs when option is off."); if (!TryMapRegsVS(r, vsz, flags)) { // TODO: Could be more optimal. for (int i = 0; i < n; ++i) { StoreFromRegisterV(r[i]); } if (!TryMapRegsVS(r, vsz, flags)) { _dbg_assert_msg_(JIT, false, "MapRegsVS() failed on second try."); } } }
void ApplyPrefixST(float *v, u32 data, VectorSize size) { // Possible optimization shortcut: if (data == 0xe4) return; int n = GetNumVectorElements(size); float origV[4]; static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f}; for (int i = 0; i < n; i++) { origV[i] = v[i]; } for (int i = 0; i < n; i++) { int regnum = (data >> (i*2)) & 3; int abs = (data >> (8+i)) & 1; int negate = (data >> (16+i)) & 1; int constants = (data >> (12+i)) & 1; if (!constants) { // Prefix may say "z, z, z, z" but if this is a pair, we force to x. // TODO: But some ops seem to use const 0 instead? if (regnum >= n) { ERROR_LOG_REPORT(CPU, "Invalid VFPU swizzle: %08x: %i / %d at PC = %08x (%s)", data, regnum, n, currentMIPS->pc, currentMIPS->DisasmAt(currentMIPS->pc)); //for (int i = 0; i < 12; i++) { // ERROR_LOG(CPU, " vfpuCtrl[%i] = %08x", i, currentMIPS->vfpuCtrl[i]); //} regnum = 0; } v[i] = origV[regnum]; if (abs) v[i] = fabs(v[i]); } else { v[i] = constantArray[regnum + (abs<<2)]; } if (negate) v[i] = -v[i]; } }
// vdot: d = sum over i of s[i] * t[i]. Accumulates in S0 via VMUL followed
// by chained VMLA, then moves the scalar result to vd.
// WARNING: No prefix support - bails to Comp_Generic if any prefix may be active.
void Jit::Comp_VDot(u32 op) {
	// DISABLE;
	CONDITIONAL_DISABLE;

	if (js.MayHavePrefix()) {
		Comp_Generic(op);
		js.EatPrefix();
		return;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;
	VectorSize sz = GetVecSize(op);

	// TODO: Force read one of them into regs? probably not.
	u8 sregs[4], tregs[4];
	GetVectorRegs(sregs, sz, vs);
	GetVectorRegs(tregs, sz, vt);

	// TODO: applyprefixST here somehow (shuffle, etc...)
	fpr.MapRegsV(sregs, sz, 0);
	fpr.MapRegsV(tregs, sz, 0);

	// S0 = s[0]*t[0], then accumulate the remaining lanes into it.
	VMUL(S0, fpr.V(sregs[0]), fpr.V(tregs[0]));

	int n = GetNumVectorElements(sz);
	for (int i = 1; i < n; i++) {
		// sum += s[i]*t[i];
		VMLA(S0, fpr.V(sregs[i]), fpr.V(tregs[i]));
	}
	fpr.ReleaseSpillLocks();

	fpr.MapRegV(vd, MAP_NOINIT | MAP_DIRTY);

	// TODO: applyprefixD here somehow (write mask etc..)
	VMOV(fpr.V(vd), S0);

	fpr.ReleaseSpillLocks();
	js.EatPrefix();
}
// Interpreter version: applies D-prefix saturation in place.
// sat == 1 clamps to [0, 1]; sat == 3 clamps to [-1, 1].
void ApplyPrefixD(float *v, VectorSize size, bool onlyWriteMask = false) {
	u32 data = currentMIPS->vfpuCtrl[VFPU_CTRL_DPREFIX];
	if (!data || onlyWriteMask)
		return;

	const int count = GetNumVectorElements(size);
	for (int i = 0; i < count; i++) {
		switch ((data >> (i * 2)) & 3) {
		case 1:
			if (v[i] > 1.0f)
				v[i] = 1.0f;
			// This includes -0.0f -> +0.0f.
			if (v[i] <= 0.0f)
				v[i] = 0.0f;
			break;
		case 3:
			if (v[i] > 1.0f)
				v[i] = 1.0f;
			if (v[i] < -1.0f)
				v[i] = -1.0f;
			break;
		default:
			break;
		}
	}
}
// Checks whether vector v may be mapped as one SIMD register. Fails when
// SIMD is disabled, when any element is locked, or when an element already
// belongs to a SIMD group (lane != 0) other than an exact existing mapping.
bool FPURegCache::CanMapVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);
	if (!jo_->enableVFPUSIMD) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		// Exact mapping already exists - trivially fine.
		return true;
	} else if (vregs[v[0]].lane != 0) {
		const MIPSCachedFPReg &v0 = vregs[v[0]];
		_dbg_assert_msg_(JIT, v0.away, "Must be away when lane != 0");
		_dbg_assert_msg_(JIT, v0.location.IsSimpleReg(), "Must be is register when lane != 0");

		// Already in a different simd set.
		return false;
	}

	if (vregs[v[0]].locked) {
		// If it's locked, we can't mess with it.
		return false;
	}

	// Next, fail if any of the other regs are in simd currently.
	// TODO: Only if locked? Not sure if it will be worth breaking them anyway.
	for (int i = 1; i < n; ++i) {
		if (vregs[v[i]].lane != 0) {
			return false;
		}

		// If it's locked, in simd or not, we can't use it.
		if (vregs[v[i]].locked) {
			return false;
		}
		_assert_msg_(JIT, !vregs[v[i]].location.IsImm(), "Cannot handle imms in fp cache.");
	}

	return true;
}
void ApplyPrefixST(float *v, u32 data, VectorSize size) { // Possible optimization shortcut: if (data == 0xe4) return; int n = GetNumVectorElements(size); float origV[4]; static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f}; for (int i = 0; i < n; i++) { origV[i] = v[i]; } for (int i = 0; i < n; i++) { int regnum = (data >> (i*2)) & 3; int abs = (data >> (8+i)) & 1; int negate = (data >> (16+i)) & 1; int constants = (data >> (12+i)) & 1; if (!constants) { v[i] = origV[regnum]; if (abs) v[i] = fabs(v[i]); } else { v[i] = constantArray[regnum + (abs<<2)]; } if (negate) v[i] = -v[i]; } }
// vzero / vone: fills the destination vector with 0.0f or 1.0f by loading the
// constant into S0 and broadcasting it to every destination lane.
// WARNING: No prefix support - bails to Comp_Generic if any prefix may be active.
void Jit::Comp_VVectorInit(u32 op) {
	CONDITIONAL_DISABLE;

	if (js.MayHavePrefix()) {
		Comp_Generic(op);
		js.EatPrefix();
		return;
	}

	switch ((op >> 16) & 0xF) {
	case 6: // v=zeros; break;  //vzero
		MOVI2F(S0, 0.0f, R0);
		break;
	case 7: // v=ones; break;   //vone
		MOVI2F(S0, 1.0f, R0);
		break;
	default:
		DISABLE;
		break;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Broadcast the constant in S0 into every destination lane.
	for (int i = 0; i < n; ++i)
		VMOV(fpr.V(dregs[i]), S0);

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocks();
}
// vzero / vone: fills the destination vector with 0.0f or 1.0f by loading the
// constant into S0 and broadcasting it to every destination lane.
// WARNING: No prefix support - DISABLEs on unknown prefixes.
void Jit::Comp_VVectorInit(MIPSOpcode op) {
	CONDITIONAL_DISABLE;

	if (js.HasUnknownPrefix() || disablePrefixes) {
		DISABLE;
	}

	switch ((op >> 16) & 0xF) {
	case 6: // v=zeros; break;  //vzero
		MOVI2F(S0, 0.0f, R0);
		break;
	case 7: // v=ones; break;   //vone
		MOVI2F(S0, 1.0f, R0);
		break;
	default:
		DISABLE;
		break;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 dregs[4];
	GetVectorRegsPrefixD(dregs, sz, _VD);
	fpr.MapRegsAndSpillLockV(dregs, sz, MAP_NOINIT | MAP_DIRTY);

	// Broadcast the constant in S0 into every destination lane.
	for (int i = 0; i < n; ++i)
		VMOV(fpr.V(dregs[i]), S0);

	ApplyPrefixD(dregs, sz);
	fpr.ReleaseSpillLocksAndDiscardTemps();
}
// Emits ARM code applying D-prefix saturation with explicit VCMP + predicated
// VMOV (compare-and-select). The older fabs-based approximation it replaced
// is kept commented out below each case.
void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
	_assert_(js.prefixDFlag & ArmJitState::PREFIX_KNOWN);
	if (!js.prefixD)
		return;

	int n = GetNumVectorElements(sz);
	for (int i = 0; i < n; i++) {
		// Write-masked lanes are skipped entirely.
		if (js.VfpuWriteMask(i))
			continue;

		// TODO: These clampers are wrong - put this into google
		// and look at the plot: abs(x) - abs(x-0.5) + 0.5 - It's too steep.
		// Also, they mishandle NaN and Inf.
		// (The TODO above refers to the commented-out fabs sequences below.)
		int sat = (js.prefixD >> (i * 2)) & 3;
		if (sat == 1) {
			// Clamp to [0, 1]: compare against 0 (select 0 on LE), then
			// against 1 (select 1 on GT).
			// clamped = fabs(x) - fabs(x-0.5f) + 0.5f; // [ 0, 1]
			fpr.MapRegV(vregs[i], MAP_DIRTY);
			MOVI2F(S0, 0.0f, R0);
			MOVI2F(S1, 1.0f, R0);
			VCMP(fpr.V(vregs[i]), S0);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_LE);
			VMOV(fpr.V(vregs[i]), S0);
			SetCC(CC_AL);
			VCMP(fpr.V(vregs[i]), S1);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_GT);
			VMOV(fpr.V(vregs[i]), S1);
			SetCC(CC_AL);
			/*
			VABS(S1, fpr.V(vregs[i]));                  // S1 = fabs(x)
			VSUB(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); // S2 = fabs(x-0.5f) {VABD}
			VABS(fpr.V(vregs[i]), fpr.V(vregs[i]));
			VSUB(fpr.V(vregs[i]), S1, fpr.V(vregs[i])); // v[i] = S1 - S2 + 0.5f
			VADD(fpr.V(vregs[i]), fpr.V(vregs[i]), S0);*/
		} else if (sat == 3) {
			// Clamp to [-1, 1]: compare against -1 (select -1 on LT), then
			// against 1 (select 1 on GT).
			fpr.MapRegV(vregs[i], MAP_DIRTY);
			MOVI2F(S0, -1.0f, R0);
			MOVI2F(S1, 1.0f, R0);
			VCMP(fpr.V(vregs[i]), S0);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_LT);
			VMOV(fpr.V(vregs[i]), S0);
			SetCC(CC_AL);
			VCMP(fpr.V(vregs[i]), S1);
			VMRS_APSR(); // Move FP flags from FPSCR to APSR (regular flags).
			SetCC(CC_GT);
			VMOV(fpr.V(vregs[i]), S1);
			SetCC(CC_AL);

			// clamped = fabs(x) - fabs(x-1.0f); // [-1, 1]
			/*
			fpr.MapRegV(vregs[i], MAP_DIRTY);
			MOVI2F(S0, 1.0f, R0);
			VABS(S1, fpr.V(vregs[i]));                  // S1 = fabs(x)
			VSUB(fpr.V(vregs[i]), fpr.V(vregs[i]), S0); // S2 = fabs(x-1.0f) {VABD}
			VABS(fpr.V(vregs[i]), fpr.V(vregs[i]));
			VSUB(fpr.V(vregs[i]), S1, fpr.V(vregs[i])); // v[i] = S1 - S2
			*/
		}
	}
}
// Compiles three-operand vector ops (vadd/vsub/vdiv/vmul) by dispatching to
// the matching VFP emitter via a member-function pointer, lane by lane.
// Note: the unconditional DISABLE right below currently turns this handler
// off entirely.
// WARNING: No prefix support - bails to Comp_Generic if any prefix may be active.
void Jit::Comp_VecDo3(u32 op) {
	CONDITIONAL_DISABLE;
	DISABLE;

	if (js.MayHavePrefix()) {
		Comp_Generic(op);
		js.EatPrefix();
		return;
	}

	int vd = _VD;
	int vs = _VS;
	int vt = _VT;

	// Select the VFP op to emit per lane; stays NULL for unsupported encodings.
	void (ARMXEmitter::*triop)(ARMReg, ARMReg, ARMReg) = NULL;
	switch (op >> 26) {
	case 24: //VFPU0
		switch ((op >> 23)&7) {
		case 0: // d[i] = s[i] + t[i]; break; //vadd
			triop = &ARMXEmitter::VADD;
			break;
		case 1: // d[i] = s[i] - t[i]; break; //vsub
			triop = &ARMXEmitter::VSUB;
			break;
		case 7: // d[i] = s[i] / t[i]; break; //vdiv
			triop = &ARMXEmitter::VDIV;
			break;
		}
		break;
	case 25: //VFPU1
		switch ((op >> 23)&7) {
		case 0: // d[i] = s[i] * t[i]; break; //vmul
			triop = &ARMXEmitter::VMUL;
			break;
		}
		break;
	}

	if (!triop) {
		DISABLE;
	}

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], tregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixT(tregs, sz, _VT);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	// When a destination lane overlaps a source lane unsafely, compute into a
	// temp V reg and copy to the real destination afterwards.
	MIPSReg tempregs[4];
	for (int i = 0; i < n; i++) {
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs, n, tregs)) {
			tempregs[i] = fpr.GetTempV();
		} else {
			fpr.MapRegV(dregs[i], (dregs[i] == sregs[i] || dregs[i] == tregs[i] ? 0 : MAP_NOINIT) | MAP_DIRTY);
			tempregs[i] = dregs[i];
		}
	}

	for (int i = 0; i < n; i++) {
		fpr.SpillLockV(sregs[i]);
		fpr.SpillLockV(tregs[i]);
		fpr.MapRegV(sregs[i]);
		fpr.MapRegV(tregs[i]);
		fpr.MapRegV(tempregs[i]);
		(this->*triop)(fpr.V(tempregs[i]), fpr.V(sregs[i]), fpr.V(tregs[i]));
		fpr.ReleaseSpillLockV(sregs[i]);
		fpr.ReleaseSpillLockV(tregs[i]);
	}

	// Copy any temp results into the real destination registers.
	fpr.MapRegsV(dregs, sz, MAP_DIRTY);
	for (int i = 0; i < n; i++) {
		if (dregs[i] != tempregs[i])
			VMOV(fpr.V(dregs[i]), fpr.V(tempregs[i]));
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
	js.EatPrefix();
}
// Spill-locks every element of the vector, then binds each to a register
// (loading current contents unless MAP_NOINIT was requested).
void FPURegCache::MapRegsV(const u8 *v, VectorSize sz, int flags) {
	SpillLockV(v, sz);

	const bool doLoad = (flags & MAP_NOINIT) == 0;
	const bool makeDirty = (flags & MAP_DIRTY) != 0;
	const int count = GetNumVectorElements(sz);
	for (int i = 0; i != count; ++i)
		BindToRegister(v[i] + 32, doLoad, makeDirty);
}
// Compiles two-operand vector ops (vmov/vabs/vneg/vrcp/vrsq/vsqrt/vnrcp etc.),
// dispatching on bits 16..20 of the opcode; unsupported ops DISABLE to the
// interpreter. Note: the unconditional DISABLE right below currently turns
// this handler off entirely.
void Jit::Comp_VV2Op(u32 op) {
	CONDITIONAL_DISABLE;
	DISABLE;

	if (js.HasUnknownPrefix())
		DISABLE;

	VectorSize sz = GetVecSize(op);
	int n = GetNumVectorElements(sz);

	u8 sregs[4], dregs[4];
	GetVectorRegsPrefixS(sregs, sz, _VS);
	GetVectorRegsPrefixD(dregs, sz, _VD);

	ARMReg tempxregs[4];
	for (int i = 0; i < n; ++i) {
		if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
			// Overlap not safe: compute into a fresh temp V reg instead.
			int reg = fpr.GetTempV();
			fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
			fpr.SpillLockV(reg);
			tempxregs[i] = fpr.V(reg);
		} else {
			// Safe to compute straight into the destination register.
			fpr.MapRegV(dregs[i], (dregs[i] == sregs[i] ? 0 : MAP_NOINIT) | MAP_DIRTY);
			fpr.SpillLockV(dregs[i]);
			tempxregs[i] = fpr.V(dregs[i]);
		}
	}

	// Warning: sregs[i] and tempxregs[i] may be the same reg.
	// Helps for vmov, hurts for vrcp, etc.
	for (int i = 0; i < n; ++i) {
		switch ((op >> 16) & 0x1f) {
		case 0: // d[i] = s[i]; break; //vmov
			// Probably for swizzle.
			VMOV(tempxregs[i], fpr.V(sregs[i]));
			break;
		case 1: // d[i] = fabsf(s[i]); break; //vabs
			//if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
			VABS(tempxregs[i], fpr.V(sregs[i]));
			break;
		case 2: // d[i] = -s[i]; break; //vneg
			VNEG(tempxregs[i], fpr.V(sregs[i]));
			break;
		case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
			DISABLE;
			break;
		case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
			DISABLE;
			break;
		case 16: // d[i] = 1.0f / s[i]; break; //vrcp
			MOVI2F(S0, 1.0f, R0);
			VDIV(tempxregs[i], S0, fpr.V(sregs[i]));
			break;
		case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
			MOVI2F(S0, 1.0f, R0);
			VSQRT(S1, fpr.V(sregs[i]));
			VDIV(tempxregs[i], S0, S1);
			break;
		case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
			DISABLE;
			break;
		case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
			DISABLE;
			break;
		case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
			DISABLE;
			break;
		case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
			DISABLE;
			break;
		case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
			VSQRT(tempxregs[i], fpr.V(sregs[i]));
			// NOTE(review): VABS after VSQRT - presumably normalizes the result's
			// sign (e.g. sqrt(-0.0f) is -0.0f); confirm against PSP behavior.
			VABS(tempxregs[i], tempxregs[i]);
			break;
		case 23: // d[i] = asinf(s[i] * (float)M_2_PI); break; //vasin
			DISABLE;
			break;
		case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
			MOVI2F(S0, -1.0f, R0);
			VDIV(tempxregs[i], S0, fpr.V(sregs[i]));
			break;
		case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
			DISABLE;
			break;
		case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
			DISABLE;
			break;
		}
	}

	// Copy results from the temps into the real destination registers.
	fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
	for (int i = 0; i < n; ++i) {
		VMOV(fpr.V(dregs[i]), tempxregs[i]);
	}

	ApplyPrefixD(dregs, sz);

	fpr.ReleaseSpillLocks();
}
// Marks every element of the vector as spill-locked.
void FPURegCache::SpillLockV(const u8 *v, VectorSize sz) {
	const int count = GetNumVectorElements(sz);
	for (int i = 0; i < count; ++i)
		vregs[v[i]].locked = true;
}
// Clears the spill lock on every element of the vector.
void FPURegCache::ReleaseSpillLockV(const u8 *vec, VectorSize sz) {
	const int count = GetNumVectorElements(sz);
	for (int i = 0; i < count; ++i)
		vregs[vec[i]].locked = false;
}
// Applies the D-prefix to a NEON destination: saturation on dest.rd
// (sat1 -> [0,1], sat3 -> [-1,1] via VMAX/VMIN), then the write mask (copying
// between rd and backingRd). Only full-vector saturation masks and a couple
// of write-mask shapes are handled; other combinations are logged.
void ArmJit::NEONApplyPrefixD(DestARMReg dest) {
	// Apply clamps to dest.rd
	int n = GetNumVectorElements(dest.sz);

	int sat1_mask = 0;
	int sat3_mask = 0;
	int full_mask = (1 << n) - 1;
	for (int i = 0; i < n; i++) {
		int sat = (js.prefixD >> (i * 2)) & 3;
		if (sat == 1)
			sat1_mask |= 1 << i;
		if (sat == 3)
			sat3_mask |= 1 << i;
	}

	if (sat1_mask && sat3_mask) {
		// Why would anyone do this?
		ELOG("PREFIXD: Can't have both sat[0-1] and sat[-1-1] at the same time yet");
	}

	if (sat1_mask) {
		if (sat1_mask != full_mask) {
			ELOG("PREFIXD: Can't have partial sat1 mask yet (%i vs %i)", sat1_mask, full_mask);
		}
		if (IsD(dest.rd)) {
			VMOV_immf(D0, 0.0);
			VMOV_immf(D1, 1.0);
			VMAX(F_32, dest.rd, dest.rd, D0);
			VMIN(F_32, dest.rd, dest.rd, D1);
		} else {
			VMOV_immf(Q0, 1.0);
			VMIN(F_32, dest.rd, dest.rd, Q0);
			VMOV_immf(Q0, 0.0);
			VMAX(F_32, dest.rd, dest.rd, Q0);
		}
	}

	if (sat3_mask && sat1_mask != full_mask) {
		if (sat3_mask != full_mask) {
			ELOG("PREFIXD: Can't have partial sat3 mask yet (%i vs %i)", sat3_mask, full_mask);
		}
		if (IsD(dest.rd)) {
			// FIX: sat3 clamps to [-1, 1]; this D-register path previously reused
			// the sat1 constants (0.0 / 1.0), clamping to [0, 1] by mistake.
			// The Q-register path below already used -1.0 correctly.
			VMOV_immf(D0, -1.0);
			VMOV_immf(D1, 1.0);
			VMAX(F_32, dest.rd, dest.rd, D0);
			VMIN(F_32, dest.rd, dest.rd, D1);
		} else {
			VMOV_immf(Q0, 1.0);
			VMIN(F_32, dest.rd, dest.rd, Q0);
			VMOV_immf(Q0, -1.0);
			VMAX(F_32, dest.rd, dest.rd, Q0);
		}
	}

	// Check for actual mask operation (unrelated to the "masks" above).
	if (dest.backingRd != dest.rd) {
		// This means that we need to apply the write mask, from rd to backingRd.
		// What a pain. We can at least shortcut easy cases like half the register.
		// And we can generate the masks easily with some of the crazy vector imm
		// modes (bits2bytes for example), so no need to load them from RAM.
		int writeMask = (~(js.prefixD >> 8)) & 0xF;

		if (writeMask == 3) {
			ILOG("Doing writemask = 3");
			// NOTE(review): this copies backingRd's low half into rd; verify the
			// direction/half against which register is ultimately flushed.
			VMOV(D_0(dest.rd), D_0(dest.backingRd));
		} else {
			// TODO
			ELOG("PREFIXD: Arbitrary write masks not supported (%i / %i)", writeMask, full_mask);
			VMOV(dest.backingRd, dest.rd);
		}
	}
}  // FIX: the original text was missing this closing brace.
// Spill-locks every element of the vector, then maps each to a register
// (loading current contents unless MAP_NOINIT is fully set).
void FPURegCache::MapRegsV(const u8 *r, VectorSize sz, int flags) {
	SpillLockV(r, sz);

	const bool loadContents = (flags & MAP_NOINIT) != MAP_NOINIT;
	const bool markDirty = (flags & MAP_DIRTY) != 0;
	const int count = GetNumVectorElements(sz);
	for (int i = 0; i < count; ++i)
		MapReg(r[i] + 32, loadContents, markDirty);
}