/* Per-lane single-precision approximation of e**arg.
 *
 * The argument is decomposed as  arg = g + n * loge(2)  with integral n, so
 *
 *     e**arg = e**g * 2**n
 *
 * e**g is evaluated with a degree-6 polynomial (theoretical peak relative
 * error in [-0.5, +0.5] is 3.5e-8) and the factor 2**n is applied through
 * ldexp_float.
 *
 * No clamping is applied to the result for very large or very small
 * arguments.  Judging by the name this is presumably a helper for a tanh
 * implementation -- confirm against the callers.
 */
always_inline VecType vec_exp_tanh_float(VecType const & arg)
{
    typedef typename VecType::int_vec int_vec;

    VecType g = arg;

    /* n = round(arg * log2(e)), kept as float lanes and as integer lanes */
    VecType n_float = round(VecType(1.44269504088896341f) * g);
    int_vec n_int   = n_float.truncate_to_int();

    /* g = arg - n * loge(2); loge(2) is subtracted in two pieces
     * (0.693359375 and -2.12194440e-4, which sum to loge(2)) */
    g -= n_float * VecType(0.693359375f);
    g -= n_float * VecType(-2.12194440e-4f);

    /* Horner scheme, innermost coefficient first */
    VecType horner = 8.420792408287525177001953125e-3f + g * 1.386119984090328216552734375e-3f;
    horner = 4.167006909847259521484375e-2f + g * horner;
    horner = 0.16665561497211456298828125f + g * horner;
    horner = 0.4999996721744537353515625f + g * horner;
    horner = 1.00000035762786865234375f + g * horner;
    VecType e_to_g = 1.f + g * horner;

    /* multiply by 2**n */
    return ldexp_float(e_to_g, n_int);
}
/* Per-lane single-precision approximation of sin(arg), following the cephes
 * sinf scheme.
 *
 * Steps:
 *   1. Split arg into its sign bit and its magnitude.
 *   2. Compute j = trunc(|arg| * 4/pi), rounded up to the next even integer,
 *      and subtract j * pi/4 from |arg| in three steps (DP1 + DP2 + DP3 sum
 *      to pi/4) to obtain the reduced argument `base`.
 *   3. Evaluate an even (cosine-type) and an odd (sine-type) polynomial in
 *      `base` and choose between them per lane depending on bit 1 of j.
 *   4. Flip the sign when bit 2 of j is set and re-apply the sign of arg.
 *
 * Arguments whose scaled magnitude does not fit the integer lanes of
 * truncate_to_int() are not treated specially.
 */
always_inline VecType vec_sin_float(VecType const & arg)
{
    typedef typename VecType::int_vec int_vec;

    const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;

    /* separate sign and magnitude */
    VecType sign    = arg & VecType::gen_sign_mask();
    VecType abs_arg = arg & VecType::gen_abs_mask();

    VecType y = abs_arg * four_over_pi;
    int_vec j = y.truncate_to_int();

    /* cephes: j=(j+1) & (~1), i.e. round j up to an even value */
    j = (j + int_vec(1)) & int_vec(~1);
    y = j.convert_to_float();

    /* sign based on quadrant: bit 2 of j, moved 29 positions up, lands on
     * bit 31 (the float sign bit) and is xor-ed into the sign */
    VecType swap_sign_bit = slli(j & int_vec(4), 29);
    sign = sign ^ swap_sign_bit;

    /* polynomial mask: set in lanes where bit 1 of j is clear */
    VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));

    /* pi/4 split into three parts for the extended-precision reduction.
     * These values are never modified, so they are plain constants instead
     * of mutable function-local statics. */
    const float DP1 = 0.78515625;
    const float DP2 = 2.4187564849853515625e-4;
    const float DP3 = 3.77489497744594108e-8;
    VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;

    VecType z = base * base;

    /* even polynomial in base (cosine-type): 1 - z/2 + z*z * P(z) */
    VecType p1 = (( 2.443315711809948E-005 * z
                  - 1.388731625493765E-003) * z
                  + 4.166664568298827E-002) * z * z
                  -0.5f * z + 1.0
                  ;

    /* odd polynomial in base (sine-type): base + base * z * Q(z) */
    VecType p2 = ((-1.9515295891E-4 * z
                 + 8.3321608736E-3) * z
                 - 1.6666654611E-1) * z * base + base;

    /* presumably select(a, b, mask) yields b where mask is set, so the odd
     * polynomial is used in lanes where bit 1 of j is clear */
    VecType approximation = select(p1, p2, poly_mask);

    return approximation ^ sign;
}
/* Per-lane single-precision approximation of e**arg with clamping at the
 * ends of the argument range.
 *
 * The argument is decomposed as  arg = g + n * loge(2)  with integral n, so
 *
 *     e**arg = e**g * 2**n
 *
 * e**g is evaluated with a degree-6 polynomial (theoretical peak relative
 * error in [-0.5, +0.5] is 3.5e-8) and the factor 2**n is applied through
 * ldexp_float.
 *
 * Afterwards two selects replace the result in lanes with arg > maxlogf by
 * the largest finite float, and in lanes with arg < -maxlogf by zero
 * (assuming select(a, b, mask) yields b where the mask is set).
 */
always_inline VecType vec_exp_float(VecType const & arg)
{
    typedef typename VecType::int_vec int_vec;

    VecType g = arg;

    /* n = round(arg * log2(e)), kept as float lanes and as integer lanes */
    VecType n_float = round(VecType(1.44269504088896341f) * g);
    int_vec n_int   = n_float.truncate_to_int();

    /* g = arg - n * loge(2); loge(2) is subtracted in two pieces
     * (0.693359375 and -2.12194440e-4, which sum to loge(2)) */
    g -= n_float * VecType(0.693359375f);
    g -= n_float * VecType(-2.12194440e-4f);

    /* Horner scheme, innermost coefficient first */
    VecType horner = 8.420792408287525177001953125e-3f + g * 1.386119984090328216552734375e-3f;
    horner = 4.167006909847259521484375e-2f + g * horner;
    horner = 0.16665561497211456298828125f + g * horner;
    horner = 0.4999996721744537353515625f + g * horner;
    horner = 1.00000035762786865234375f + g * horner;
    VecType e_to_g = VecType(VecType::gen_one()) + g * horner;

    /* multiply by 2**n */
    VecType scaled = ldexp_float(e_to_g, n_int);

    /* argument limits; the lower limit mirrors the upper one
     * (an alternative lower limit of -103.278929903431851103f is not used) */
    const VecType upper_limit(88.72283905206835f);
    const VecType lower_limit = -upper_limit;

    const VecType largest_float(std::numeric_limits<float>::max());
    const VecType nothing = VecType::gen_zero();

    VecType overflow_mask  = mask_gt(arg, upper_limit);
    VecType underflow_mask = mask_lt(arg, lower_limit);

    /* handle min/max boundaries */
    VecType clamped = select(scaled, largest_float, overflow_mask);
    clamped = select(clamped, nothing, underflow_mask);

    return clamped;
}
/* Per-lane single-precision approximation of tan(arg).
 *
 * Steps:
 *   1. Split arg into its sign bit and its magnitude.
 *   2. Compute j = trunc(|arg| * 4/pi), rounded up to the next even integer,
 *      and subtract j * pi/4 from |arg| in three steps (DP1 + DP2 + DP3 sum
 *      to pi/4) to obtain the reduced argument `base`.
 *   3. Evaluate an odd polynomial approximation of tan(base).
 *   4. Per lane, keep that value when bit 1 of j is clear, otherwise use its
 *      negative reciprocal; finally re-apply the sign of arg.
 *
 * Arguments whose scaled magnitude does not fit the integer lanes of
 * truncate_to_int() are not treated specially.
 */
always_inline VecType vec_tan_float(VecType const & arg)
{
    typedef typename VecType::int_vec int_vec;

    const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;

    /* separate sign and magnitude */
    VecType sign    = arg & VecType::gen_sign_mask();
    VecType abs_arg = arg & VecType::gen_abs_mask();

    VecType y = abs_arg * four_over_pi;
    int_vec j = y.truncate_to_int();

    /* cephes: j=(j+1) & (~1), i.e. round j up to an even value */
    j = (j + int_vec(1)) & int_vec(~1);
    y = j.convert_to_float();

    /* approximation mask: set in lanes where bit 1 of j is clear */
    VecType poly_mask = VecType (mask_eq(j & int_vec(2), int_vec(0)));

    /* pi/4 split into three parts for the extended-precision reduction.
     * These values are never modified, so they are plain constants instead
     * of mutable function-local statics. */
    const float DP1 = 0.78515625;
    const float DP2 = 2.4187564849853515625e-4;
    const float DP3 = 3.77489497744594108e-8;
    VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;

    VecType base2 = base*base;

    // sollya: fpminimax(tan(x), [|3,5,7,9,11,13|], [|24...|], [-pi/4,pi/4], x);
    /* Horner scheme in base2, innermost coefficient first */
    VecType series = 3.1127030961215496063232421875e-3 + base2 * 9.3892104923725128173828125e-3;
    series = 2.443529665470123291015625e-2 + base2 * series;
    series = 5.3409568965435028076171875e-2 + base2 * series;
    series = 0.1333882510662078857421875 + base2 * series;
    series = 0.3333315551280975341796875 + base2 * series;
    VecType approx = base + base * base2 * series;

    /* negative reciprocal, used where the reduction removed an odd multiple
     * of pi/2 */
    VecType recip = -1.0 / approx;

    /* presumably select(a, b, mask) yields b where mask is set, so `approx`
     * is kept in lanes where bit 1 of j is clear */
    VecType approximation = select(recip, approx, poly_mask);

    return approximation ^ sign;
}