ExpCode(const ExpVar<N> *self) { Xbyak::util::Cpu cpu; bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41); try { makeExp(self, useSSE41); exp_ = (float(*)(float))getCode(); align(16); exp_ps_ = (__m128(*)(__m128))getCurr(); makeExpPs(self, useSSE41); return; } catch (Xbyak::Error err) { fprintf(stderr, "ExpCode ERR:%s(%d)\n", Xbyak::ConvertErrorToString(err), err); } catch (...) { fprintf(stderr, "ExpCode ERR:unknown error\n"); } ::exit(1); }
void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu) { typedef ExpVar<N> Self; using namespace local; using namespace Xbyak; inLocalLabel(); #ifdef XBYAK64 const Reg64& base = rcx; const Reg64& a = rax; const Reg64& d = rdx; #else const Reg32& base = ecx; const Reg32& a = eax; const Reg32& d = edx; #endif /* if abs(x) >= maxX then x = max(min(x, maxX), -maxX) and try minps, maxps are very slow then avoid them */ const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41); #if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER) movaps(xm0, ptr [rcx]); #endif mov(base, (size_t)self); L(".retry"); movaps(xm5, xm0); andps(xm5, ptr [base + offsetof(Self, i7fffffff)]); movaps(xm3, ptr [base + offsetof(Self, a)]); movaps(xm4, ptr [base + offsetof(Self, b)]); pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]); mulps(xm3, xm0); movaps(xm1, ptr [base + offsetof(Self, i127s)]); pmovmskb(eax, xm5); movaps(xm5, ptr [base + offsetof(Self, mask_s)]); cvtps2dq(xm2, xm3); pand(xm5, xm2); cvtdq2ps(xm3, xm2); test(eax, eax); jnz(".overflow"); paddd(xm1, xm2); movd(eax, xm5); mulps(xm4, xm3); pextrw(edx, xm5, 2); subps(xm0, xm4); movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]); addps(xm0, ptr [base + offsetof(Self, f1)]); pextrw(eax, xm5, 4); if (useSSE41) { pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1); } else { movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]); movlhps(xm4, xm3); } pextrw(edx, xm5, 6); psrld(xm1, self->s); pslld(xm1, 23); if (useSSE41) { pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2); pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3); } else { movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]); movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]); movlhps(xm2, xm3); shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0)); } por(xm1, xm4); mulps(xm0, xm1); ret(); L(".overflow"); minps(xm0, ptr [base + offsetof(Self, maxX)]); maxps(xm0, ptr [base + offsetof(Self, minX)]); jmp(".retry"); outLocalLabel(); }