예제 #1
0
 ExpCode(const ExpVar<N> *self)
 {
     Xbyak::util::Cpu cpu;
     bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
     try {
         makeExp(self, useSSE41);
         exp_ = (float(*)(float))getCode();
         align(16);
         exp_ps_ = (__m128(*)(__m128))getCurr();
         makeExpPs(self, useSSE41);
         return;
     } catch (Xbyak::Error err) {
         fprintf(stderr, "ExpCode ERR:%s(%d)\n", Xbyak::ConvertErrorToString(err), err);
     } catch (...) {
         fprintf(stderr, "ExpCode ERR:unknown error\n");
     }
     ::exit(1);
 }
예제 #2
0
	/*
		Emit the instruction sequence of the packed (four float lanes per xmm
		register) routine. Every call below appends one instruction to the
		code buffer, so the order of the statements is the order of the
		generated code.

		self : its address is embedded in the generated code as an immediate
		       and its members (i7fffffff, a, b, maxX, minX, i127s, mask_s, f1,
		       tbl) are read through offsetof() when the generated code runs,
		       so the object has to stay alive as long as the code is used.
		       self->s is read now, at generation time, and baked in as a
		       shift count.
		cpu  : queried once for SSE4.1, which selects pinsrd or an SSE2
		       fallback for collecting the four table entries.

		Generated code: reads its argument from xmm0 and leaves the result in
		xmm0; overwrites xmm1-xmm5, eax, edx and the base register (rcx/ecx).
		NOTE(review): the exact meaning of the constants in ExpVar is not
		visible here; the per-step comments only describe the data flow.
	*/
	void makeExpPs(const ExpVar<N> *self, const Xbyak::util::Cpu& cpu)
	{
		typedef ExpVar<N> Self;
		using namespace local;
		using namespace Xbyak;

		// labels starting with '.' are local to this inLocalLabel()/outLocalLabel() pair
		inLocalLabel();
		// pointer-sized aliases so the same addressing expressions work in 32- and 64-bit builds
#ifdef XBYAK64
		const Reg64& base = rcx;
		const Reg64& a = rax;
		const Reg64& d = rdx;
#else
		const Reg32& base = ecx;
		const Reg32& a = eax;
		const Reg32& d = edx;
#endif

/*
	Range handling: if abs(x) > maxX in any lane, x is clamped with
	x = max(min(x, maxX), minX) and the computation is restarted.
	minps/maxps are slow, so they are kept off the common path and are only
	executed from the .overflow branch.
*/
		const bool useSSE41 = cpu.has(Xbyak::util::Cpu::tSSE41);
#if defined(XBYAK64_WIN) && !defined(__INTEL_COMPILER)
		// NOTE(review): presumably the argument arrives by reference in rcx under
		// this ABI/compiler combination - confirm; it is loaded before rcx is reused as base
		movaps(xm0, ptr [rcx]);
#endif
		// base = address of *self (immediate)
		mov(base, (size_t)self);
	L(".retry");
		// xm5 = x & i7fffffff (presumably clears the sign bit, i.e. abs(x) as raw bits)
		movaps(xm5, xm0);
		andps(xm5, ptr [base + offsetof(Self, i7fffffff)]);
		movaps(xm3, ptr [base + offsetof(Self, a)]);
		movaps(xm4, ptr [base + offsetof(Self, b)]);
		// signed 32-bit integer compare: a lane becomes all-ones when its xm5 value is greater than maxX
		pcmpgtd(xm5, ptr [base + offsetof(Self, maxX)]);
		// xm3 = a * x
		mulps(xm3, xm0);
		movaps(xm1, ptr [base + offsetof(Self, i127s)]);
		// eax != 0 iff at least one lane failed the range check (tested below)
		pmovmskb(eax, xm5);
		movaps(xm5, ptr [base + offsetof(Self, mask_s)]);
		// xm2 = a * x converted to 32-bit integers
		cvtps2dq(xm2, xm3);
		// xm5 = xm2 & mask_s; used below as the per-lane index into tbl
		pand(xm5, xm2);
		// xm3 = the integers of xm2 converted back to float
		cvtdq2ps(xm3, xm2);
		test(eax, eax);
		jnz(".overflow");
		// xm1 = i127s + xm2
		paddd(xm1, xm2);
		// eax = index of lane 0
		movd(eax, xm5);
		// xm4 = b * float(xm2)
		mulps(xm4, xm3);
		// edx = word 2 of xm5 = low 16 bits of the lane 1 index
		pextrw(edx, xm5, 2);
		// xm0 = x - b * float(xm2)
		subps(xm0, xm4);
		// xm4 = [tbl[idx0], 0, 0, 0]
		movd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)]);
		// xm0 += f1
		addps(xm0, ptr [base + offsetof(Self, f1)]);
		// eax = word 4 of xm5 = low 16 bits of the lane 2 index
		pextrw(eax, xm5, 4);
		if (useSSE41) {
			// insert tbl[idx1] directly into lane 1
			pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 1);
		} else {
			// SSE2 fallback: xm4 = [tbl[idx0], 0, tbl[idx1], 0]; compacted by the shufps below
			movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
			movlhps(xm4, xm3);
		}
		// edx = word 6 of xm5 = low 16 bits of the lane 3 index
		pextrw(edx, xm5, 6);
		// xm1 = ((i127s + xm2) >> s) << 23: the shifted value lands at bit 23 and above,
		// which is where the exponent field of a float begins
		psrld(xm1, self->s);
		pslld(xm1, 23);
		if (useSSE41) {
			// insert tbl[idx2] and tbl[idx3] into lanes 2 and 3
			pinsrd(xm4, ptr [base + a * 4 + offsetof(Self, tbl)], 2);
			pinsrd(xm4, ptr [base + d * 4 + offsetof(Self, tbl)], 3);
		} else {
			// xm2 = [tbl[idx2], 0, tbl[idx3], 0]
			movd(xm2, ptr [base + a * 4 + offsetof(Self, tbl)]);
			movd(xm3, ptr [base + d * 4 + offsetof(Self, tbl)]);
			movlhps(xm2, xm3);
			// take elements 0 and 2 of xm4, then 0 and 2 of xm2:
			// xm4 = [tbl[idx0], tbl[idx1], tbl[idx2], tbl[idx3]]
			shufps(xm4, xm2, MIE_PACK(2, 0, 2, 0));
		}
		// combine the shifted bits with the table entries, then scale:
		// result = (x - b * float(xm2) + f1) * bits(xm1 | tbl[idx])
		por(xm1, xm4);
		mulps(xm0, xm1);
		ret();
	L(".overflow");
		// slow path: clamp every lane into [minX, maxX] and redo the computation
		minps(xm0, ptr [base + offsetof(Self, maxX)]);
		maxps(xm0, ptr [base + offsetof(Self, minX)]);
		jmp(".retry");
		outLocalLabel();
	}