// Power-series branch of the evaluation.
//
// x  : argument; modified in place (lanes equal to zero are bumped by one so
//      that the series loop terminates -- the original note says the loop is
//      infinite for x == 0).
// sn : scalar order; drives the harmonic sum and the powi exponent.
// n  : the order as a vector, used for the series offset and for gamma(n).
//
// Lanes where x was zero get Inf added to their result at the very end.
// NOTE(review): seladd(c, a, b) is presumed to yield a + b where c holds and
// a elsewhere -- confirm against its definition.
static inline AA1 case_1(AA1 & x, int32_t sn, const AA1 & n)
{
  typedef typename meta::scalar_of<AA1>::type sA1;

  // Remember the zero lanes before patching them.
  AA1 zero_lanes = is_eqz(x);
  x = seladd(is_eqz(x), x, One<A1>());

  // Scalar harmonic sum 1/(sn-1) + ... + 1/1.
  // NOTE(review): the loop only stops when the counter hits zero, so it
  // presumably expects sn >= 1 -- confirm with callers.
  sA1 harmonic = Zero<sA1>();
  for (int32_t i = sn - 1; i != 0; --i)
  {
    harmonic += rec(static_cast<sA1>(i));
  }
  AA1 psi = -Euler<A1>()-nt2::log(x)+splat<A1>(harmonic);

  AA1 rel;                        // relative size of the last term, per lane
  AA1 neg_x = -x;
  AA1 k     = Zero<A1>();         // term counter
  AA1 term  = One<A1>();          // running product of neg_x / k
  AA1 denom = One<A1>() - n;
  AA1 acc   = ( sn == 1 ) ? Zero<A1>() : rec(denom);

  do
  {
    k     += One<AA1>();
    term  *= neg_x/k;
    denom += One<AA1>();
    // Only accumulate where the denominator is non-zero.
    acc = seladd(is_nez(denom), acc, term/denom);
    // Where the sum is still zero, force another pass by reporting 1.
    rel = select(is_nez(acc), nt2::abs(term/acc), One<AA1>());
  }
  while (nt2::bitwise_any(gt(rel, Halfeps<A1>())));  // until every lane converged

  // TODO (from the original): pow->powi and gamma splatted from scalar or a
  // mere factorial call.
  return seladd(zero_lanes,
                (nt2::powi(neg_x, sn-1) * psi / nt2::gamma(n)) - acc,
                Inf<A1>());
}
// Finds the significand of x in the antilog table (accessed through
// twomio16) and reduces x in place relative to the selected entry.
//
// x : in/out; on return holds (x - entry - continuation(index >> 1)) / entry.
// Returns the selected table index.
//
// The search is written without branches: each step conditionally replaces or
// bumps the index.
// NOTE(review): seladd(c, a, b) is presumed to be "a + b where c, else a".
static BOOST_FORCEINLINE i_t select(A0& x)
{
  i_t idx = One<i_t>();
  idx = if_else(x <= twomio16(Nine<i_t>()),        Nine<i_t>(), idx);
  idx = seladd (x <= twomio16(idx + Four<i_t>()),  idx, Four<i_t>());
  idx = seladd (x <= twomio16(idx + Two<i_t>()),   idx, Two<i_t>());
  idx = if_else(x >= twomio16(One<i_t>()),         Mone<i_t>(), idx);
  idx = inc(idx);

  // Reduce x against the chosen entry and its continuation term.
  A0 entry = twomio16(idx);
  x -= entry;
  x -= continuation(shr(idx, 1));
  x /= entry;

  return idx;
}
// Core arctangent evaluation on |a0_n| (the sign of the input is not
// re-applied here).
//
// The magnitude is split into three ranges by two thresholds:
//   * below tanpio8           : argument used as is, offset 0
//   * [tanpio8, tan3pio8)     : argument (x-1)/(x+1), offset Pio_4
//   * tan3pio8 and above      : argument -1/x,        offset Pio_2
// A rational approximation (two horner polynomials in the squared reduced
// argument) is then evaluated and a small "morebits" correction is added in
// the two upper ranges.
static inline A0_n kernel_atan(const A0_n a0_n)
{
  typedef typename meta::scalar_of<A0>::type sA0;

  const A0 tan3pio8 = double_constant<A0, 0x4003504f333f9de6ll>();
  const A0 tanpio8  = double_constant<A0, 0x3fda827999fcef31ll>();
  const A0 morebits = double_constant<A0, 0x3c91a62633145c07ll>();

  const A0 a0  = {a0_n};
  const A0 mag = nt2::abs(a0);

  // below_hi : magnitude under the upper threshold
  // in_mid   : magnitude inside the middle range
  const bA0 below_hi = lt(mag, tan3pio8);
  const bA0 in_mid   = logical_and(ge(mag, tanpio8), below_hi);

  // Additive offset for each range.
  A0 offset = if_zero_else(below_hi, Pio_2<A0>());
  offset    = select(in_mid, Pio_4<A0>(), offset);

  // Reduced argument for each range.
  A0 red = select(below_hi, mag, -rec(mag));
  red    = select(in_mid, minusone(mag)/oneplus(mag), red);

  // Rational approximation in red^2.
  A0 r = sqr(red);
  r = r*horner< NT2_HORNER_COEFF_T(sA0, 5,
                                   (0xbfec007fa1f72594ll,
                                    0xc03028545b6b807all,
                                    0xc052c08c36880273ll,
                                    0xc05eb8bf2d05ba25ll,
                                    0xc0503669fd28ec8ell)
                                  )>(r)/
        horner< NT2_HORNER_COEFF_T(sA0, 6,
                                   (0x3ff0000000000000ll,
                                    0x4038dbc45b14603cll,
                                    0x4064a0dd43b8fa25ll,
                                    0x407b0e18d2e2be3bll,
                                    0x407e563f13b049eall,
                                    0x4068519efbbd62ecll)
                                  )>(r);
  r = fma(red, r, red);

  // Correction term: half of morebits in the middle range, all of it in the
  // upper range.
  r = seladd(in_mid, r, mul(Half<A0>(), morebits));
  r = r+if_zero_else(below_hi, morebits);

  return offset + r;
}
// Base-10 logarithm built on kernel_log.
//
// kernel_log fills dk, hfsq, s, R and f; they are combined here and scaled
// by Invlog_10 / Log_2olog_10.
// Special inputs are handled without extra selections:
//   * edge = a0 - 1/|a0| is infinite exactly for a0 == 0 and infinite a0, and
//     is added to the result only in those lanes;
//   * negative or NaN inputs have their result OR-ed with an all-ones mask.
static inline A0 log10(const A0& a0)
{
  A0 dk, hfsq, s, R, f;
  kernel_log(a0, dk, hfsq, s, R, f);

  A0 res  = -(hfsq-(s*(hfsq+R))-f)*Invlog_10<A0>()+dk*Log_2olog_10<A0>();
  A0 edge = a0-rec(abs(a0));   // trick to reduce selection testing

  const A0 masked = b_or(res, b_or(is_ltz(a0), is_nan(a0)));
  return seladd(is_inf(edge), masked, edge);
}
// Natural logarithm built on kernel_log.
//
// ln(2) is applied as a high and a low part:
//   ln(2)hi = 6.93147180369123816490e-01 (0x3fe62e42fee00000)
//   ln(2)lo = 1.90821492927058770002e-10 (0x3dea39ef35793c76)
// Special inputs are handled without extra selections:
//   * edge = a0 - 1/|a0| is infinite exactly for a0 == 0 and infinite a0, and
//     is added to the result only in those lanes;
//   * negative or NaN inputs have their result OR-ed with an all-ones mask.
static inline A0 log(const A0& a0)
{
  A0 dk, hfsq, s, R, f;
  kernel_log(a0, dk, hfsq, s, R, f);

  A0 res = mul(dk, double_constant<A0, 0x3fe62e42fee00000ll>())-
           ((hfsq-(s*(hfsq+R)+mul(dk,double_constant<A0, 0x3dea39ef35793c76ll>())))-f);
  A0 edge = a0-rec(abs(a0));   // trick to reduce selection testing

  const A0 masked = b_or(res, b_or(is_ltz(a0), is_nan(a0)));
  return seladd(is_inf(edge), masked, edge);
}
// Base-2 logarithm (single-precision constants) built on kernel_log.
//
// kernel_log fills fe, x, x2 and y. The log of the fraction is multiplied by
// log2(e): the constant 0x3ee2a8ed is 0.44269504088896340735992, and the
// remaining "1" is applied by adding y and x themselves afterwards.
// Special inputs are handled without extra selections:
//   * edge = a0 - 1/|a0| is infinite exactly for a0 == 0 and infinite a0, and
//     is added to the result only in those lanes;
//   * negative or NaN inputs have their result OR-ed with an all-ones mask.
static inline A0 log2(const A0& a0)
{
  A0 x, fe, x2, y;
  kernel_log(a0, fe, x, x2, y);
  y = madd(Mhalf<A0>(),x2, y);

  // multiply log of fraction by log2(e)
  A0 scaled = madd(x,single_constant<A0, 0x3ee2a8ed>(),mul(y,single_constant<A0, 0x3ee2a8ed>()));
  A0 total  = ((scaled+y)+x)+fe;

  A0 edge = a0-rec(abs(a0));   // trick to reduce selection testing
  const A0 masked = b_or(total, b_or(is_ltz(a0), is_nan(a0)));
  return seladd(is_inf(edge), masked, edge);
}
// Natural logarithm (single-precision constants) built on kernel_log.
//
// kernel_log fills fe, x, x2 and y. fe is multiplied by ln(2) in two pieces:
//   0xb95e8083 = -2.12194440e-4   (small correction)
//   0x3f318000 =  0.693359375     (leading part)
// Negative and NaN inputs are turned into NaN with if_nan_else; zero and
// infinite inputs are patched by adding edge = a0 - 1/|a0|, which is infinite
// exactly in those lanes.
static inline A0 log(const A0& a0)
{
  A0 x, fe, x2, y;
  kernel_log(a0, fe, x, x2, y);

  y = madd(fe, single_constant<A0, 0xb95e8083>(), y);
  y = madd(Mhalf<A0>(), x2, y);
  A0 sum = x + y;

  A0 edge = a0-rec(abs(a0));   // trick to reduce selection testing

  A0 res = madd(single_constant<A0, 0x3f318000>(), fe, sum);
  res = if_nan_else(logical_or(is_ltz(a0), is_nan(a0)), res);

  return seladd(is_inf(edge), res, edge);
}
// Base-10 logarithm (single-precision constants) built on kernel_log.
//
// kernel_log fills fe, x, x2 and y. The log of the fraction is multiplied by
// log10(e) and fe by log10(2), each constant being split into a high and a
// low part; the small contributions are accumulated first.
//
// The constant annotations below were previously swapped for the two
// log10(2) parts; the values given here are the ones the bit patterns
// actually decode to. The arithmetic itself is unchanged.
//
// NOTE(review): amul(a, b, c) is presumed to compute a + b*c -- confirm.
// Special inputs are handled without extra selections:
//   * edge = a0 - 1/|a0| is infinite exactly for a0 == 0 and infinite a0, and
//     is added to the result only in those lanes;
//   * negative or NaN inputs have their result OR-ed with an all-ones mask.
static inline A0 log10(const A0& a0)
{
  A0 x, fe, x2, y;
  kernel_log(a0, fe, x, x2, y);
  y = amul(y, -Half<A0>(), x2);

  const A0 log10e_lo  = single_constant<A0, 0x3a37b152>(); // 7.00731903251827651129E-4f
  const A0 log10e_hi  = single_constant<A0, 0x3ede0000>(); // 4.3359375E-1f
  const A0 log10_2_lo = single_constant<A0, 0x39826a14>(); // 2.48745663981195213739E-4f
  const A0 log10_2_hi = single_constant<A0, 0x3e9a0000>(); // 3.0078125E-1f

  // multiply log of fraction by log10(e) and base 2 exponent by log10(2)
  A0 z = mul(x+y, log10e_lo);
  z = amul(z, y, log10e_hi);
  z = amul(z, x, log10e_hi);
  z = amul(z, fe, log10_2_lo);
  z = amul(z, fe, log10_2_hi);

  A0 y1 = a0-rec(abs(a0)); // trick to reduce selection testing
  return seladd(is_inf(y1), b_or(z, b_or(is_ltz(a0), is_nan(a0))),y1);
}
// Arc cosine expressed through asin.
//
// Accuracy/throughput figures recorded by the original author:
//   2130706432 values computed.
//   1968272987 values (92.38%) within 0.0 ULPs
//    162433445 values (7.62%)  within 0.5 ULPs
//   8.5 cycles/element SSE4.2 g++-4.8
//
// For |a0| > 0.5 the value is 2*asin(sqrt(0.5 - 0.5*|a0|)), reflected as
// Pi - value when a0 < -0.5; otherwise it is Pio_2 - asin(a0).
static inline A0_n acos(const A0_n a0_n)
{
  const A0 a0 = a0_n;

  A0 v = nt2::abs(a0);
  bA0 is_large = gt(v, nt2::Half<A0>());

  // Large lanes: sqrt((1 - |a0|) / 2); small lanes: the signed input itself.
  const A0 half_angle_arg = nt2::sqrt(fma(nt2::Mhalf<A0>(), v, nt2::Half<A0>()));
  v = if_else(is_large, half_angle_arg, a0);

  v = asin(v);
  v = seladd(is_large, v, v);   // doubles v in the large lanes (presumed seladd semantics)

  // Reflect for inputs below -0.5.
  v = nt2::if_else(lt(a0, nt2::Mhalf<A0>()), nt2::Pi<A0>()-v, v);

  return nt2::if_else(is_large, v, nt2::Pio_2<A0>()-v);
}
// Natural logarithm (single-precision constants) built on kernel_log.
//
// kernel_log fills fe, x, x2 and y. fe is multiplied by ln(2) in two pieces:
//   0xb95e8083 = -2.12194440e-4   (small correction)
//   0x3f318000 =  0.693359375     (leading part)
// Special inputs are handled without extra selections:
//   * edge = a0 - 1/|a0| is infinite exactly for a0 == 0 and infinite a0, and
//     is added to the result only in those lanes;
//   * negative or NaN inputs have their result OR-ed with an all-ones mask.
static inline A0 log(const A0& a0)
{
  A0 x, fe, x2, y;
  kernel_log(a0, fe, x, x2, y);

  y = madd(fe, single_constant<A0, 0xb95e8083>(), y);
  y = madd(Mhalf<A0>(), x2, y);
  A0 sum = x + y;

  A0 edge = a0-rec(abs(a0));   // trick to reduce selection testing
  A0 res  = madd(single_constant<A0, 0x3f318000>(), fe, sum);

  const A0 masked = b_or(res, b_or(is_ltz(a0), is_nan(a0)));
  return seladd(is_inf(edge), masked, edge);
}
// Continued-fraction branch of the evaluation.
//
// x : argument (read only).
// n : the order as a vector.
// The scalar order parameter is unused in this branch.
//
// Successive convergents pk/qk are built from the two previous numerator /
// denominator pairs; iteration stops once the relative change between two
// convergents is at most Halfeps in every lane. The result is the last
// convergent multiplied by exp(-x).
static inline AA1 case_2(const AA1 & x, int32_t /*sn*/, const AA1 & n)
{
  typedef typename meta::scalar_of<AA1>::type sAA1;
  int32_t sk = 1;   // scalar step counter, shared by all lanes
  AA1 t;            // per-lane relative change, tested by the loop condition
  // Seed the recurrence with the first two numerator/denominator pairs.
  AA1 pkm2 = One<AA1>();
  AA1 qkm2 = x;
  AA1 pkm1 = One<AA1>();
  AA1 qkm1 = x + n;
  AA1 ans = pkm1/qkm1;
  do
  {
    // The coefficients alternate with the parity of the step counter:
    // odd step -> yk = 1, even step -> yk = x.
    AA1 test = is_nez(splat<AA1>(is_odd(++sk)));
    AA1 k_2 = splat<AA1>(sk >> 1);
    AA1 yk = sel(test, One<AA1>(), x);
    // NOTE(review): seladd(c, a, b) is presumed to be "a + b where c, else a",
    // i.e. xk = k_2 + n on odd steps and k_2 on even steps -- confirm.
    AA1 xk = seladd(test, k_2, n);
    AA1 pk = pkm1 * yk + pkm2 * xk;
    AA1 qk = qkm1 * yk + qkm2 * xk;
    AA1 r = pk/qk;
    // Only accept the new convergent where its denominator is non-zero;
    // elsewhere keep the previous answer and report a change of 1 so the
    // loop continues.
    test = is_nez(qk);
    t = sel(test,nt2::abs((ans-r)/r),One<AA1>());
    ans = sel(test, r, ans);
    // Shift the recurrence window by one step.
    pkm2 = pkm1;
    pkm1 = pk;
    qkm2 = qkm1;
    qkm1 = qk;
    // Rescale all four terms by Halfeps in lanes where |pk| exceeds
    // Expnibig (presumably to avoid overflow -- confirm).
    test = gt(nt2::abs(pk), Expnibig<AA1>());
    AA1 fac = sel(test, Halfeps<AA1>(), One<AA1>());
    pkm2 *= fac;
    pkm1 *= fac;
    qkm2 *= fac;
    qkm1 *= fac;
  }
  while( nt2::bitwise_any(gt(t, Halfeps<AA1>())) );   // any lane not yet converged
  return ans*nt2::exp(-x);
}
// Arctangent (double-precision constants), odd-symmetric in a0.
//
// The magnitude |a0| is split into three ranges by two thresholds:
//   * below tanpio8           : argument used as is, offset 0
//   * [tanpio8, tan3pio8)     : argument (x-1)/(x+1), offset Pio_4
//   * tan3pio8 and above      : argument -1/x,        offset Pio_2
// A rational approximation (two horner polynomials in the squared reduced
// argument) is evaluated, a small "morebits" correction is added in the two
// upper ranges, and the sign bit of a0 is finally XOR-ed back in.
//
// The thresholds and the correction are plain const locals: the previous
// function-local statics were initialised but never read (the same bit
// patterns were repeated inline), and one of them ("Twothird", whose bit
// pattern is 0.66) was not used by the algorithm at all.
static inline A0 atan(const A0& a0)
{
  typedef typename meta::scalar_of<A0>::type sA0;

  const A0 tan3pio8 = double_constant<A0, 0x4003504f333f9de6ll>();
  const A0 tanpio8  = double_constant<A0, 0x3fda827999fcef31ll>();
  const A0 morebits = double_constant<A0, 0x3c91a62633145c07ll>();

  A0 x = abs(a0);

  // flag1: magnitude under the upper threshold.
  // flag2: magnitude inside the middle range.
  const A0 flag1 = lt(x, tan3pio8);
  const A0 flag2 = b_and(ge(x, tanpio8), flag1);

  // Additive offset for each range.
  A0 yy = b_notand(flag1, Pio_2<A0>());
  yy = select(flag2, Pio_4<A0>(), yy);

  // Reduced argument for each range.
  A0 xx = select(flag1, x, -rec(x));
  xx = select(flag2, minusone(x)/oneplus(x),xx);

  // Rational approximation in xx^2.
  A0 z = sqr(xx);
  z = z*horner< NT2_HORNER_COEFF_T(sA0, 5,
                                   (0xbfec007fa1f72594ll,
                                    0xc03028545b6b807all,
                                    0xc052c08c36880273ll,
                                    0xc05eb8bf2d05ba25ll,
                                    0xc0503669fd28ec8ell)
                                  )>(z)/
        horner< NT2_HORNER_COEFF_T(sA0, 6,
                                   (0x3ff0000000000000ll,
                                    0x4038dbc45b14603cll,
                                    0x4064a0dd43b8fa25ll,
                                    0x407b0e18d2e2be3bll,
                                    0x407e563f13b049eall,
                                    0x4068519efbbd62ecll)
                                  )>(z);
  z = fma(xx, z, xx);

  // Correction term: half of morebits in the middle range, all of it in the
  // upper range.
  z = seladd(flag2, z, mul(Half<A0>(), morebits));
  z = z+b_notand(flag1, morebits);

  yy = yy + z;
  return b_xor(yy, bitofsign(a0));   // restore the sign of the input
}
{ template<class Sig> struct result; template<class This,class A0> struct result<This(A0)> : meta::strip<A0>{};// NT2_FUNCTOR_CALL(1) { A0 const na = isnez(a0); A0 n = add(shri(a0, 4), Four<A0>()); A0 n1 = shri(n+a0/n, 1); A0 msk = b_and(isle(n1,n), na); n = select(msk,n1,n); n1 = sqr(n); msk = b_or(isgt(n1,a0), b_and(iseqz(n1), na)); n = seladd( msk, n, Mone<A0>()); return seladd(na, Zero<A0>(), n); } }; } } ///////////////////////////////////////////////////////////////////////////// // Implementation when type A0 is arithmetic_ ///////////////////////////////////////////////////////////////////////////// NT2_REGISTER_DISPATCH(tag::sqrt_, tag::cpu_, (A0), ((simd_<arithmetic_<A0>,tag::xop_>)) ); namespace nt2 { namespace ext