static inline void kernel_log(const A0& a0, A0& dk, A0& hfsq, A0& s, A0& R, A0& f) { A0 x; int_type k(fast_frexp(a0, x)); const int_type x_lt_sqrthf = -isgt(Sqrt_2o_2<A0>(), x); k += x_lt_sqrthf; f = minusone(x+b_and(x, genmask<A0>(x_lt_sqrthf))); dk = tofloat(k); s = f/add(Two<A0>(),f); A0 z = sqr(s); A0 w = sqr(z); A0 t1= w*horner<NT2_HORNER_COEFF_T(A0, 3, (0x3fc39a09d078c69fll, 0x3fcc71c51d8e78afll, 0x3fd999999997fa04ll) )> (w); A0 t2= z*horner<NT2_HORNER_COEFF_T(A0, 4, (0x3fc2f112df3e5244ll, 0x3fc7466496cb03dell, 0x3fd2492494229359ll, 0x3fe5555555555593ll) )> (w); R = t2+t1; hfsq = mul(Half<A0>(), sqr(f)); }
tag::cpu_, Dummy> : callable { template<class Sig> struct result; template<class This,class A0> struct result<This(A0)> : meta::strip<A0>{};// NT2_FUNCTOR_CALL(1) { A0 const na = isnez(a0); A0 n = add(shri(a0, 4), Four<A0>()); A0 n1 = shri(n+a0/n, 1); A0 msk = b_and(isle(n1,n), na); n = select(msk,n1,n); n1 = sqr(n); msk = b_or(isgt(n1,a0), b_and(iseqz(n1), na)); n = seladd( msk, n, Mone<A0>()); return seladd(na, Zero<A0>(), n); } }; } } ///////////////////////////////////////////////////////////////////////////// // Implementation when type A0 is arithmetic_ ///////////////////////////////////////////////////////////////////////////// NT2_REGISTER_DISPATCH(tag::sqrt_, tag::cpu_, (A0), ((simd_<arithmetic_<A0>,tag::xop_>)) );