/* Vectorised single-precision arctangent.
 *
 * The computation runs on |arg|; the sign bit of the input is put back at the very end.
 * The magnitude is folded into a narrow interval around zero before an odd polynomial
 * is evaluated:
 *
 *   |arg| <= tan(pi/8)              : atan(|arg|) = atan(|arg|)
 *   tan(pi/8) < |arg| <= tan(3pi/8) : atan(|arg|) = pi/4 + atan((|arg| - 1) / (|arg| + 1))
 *   |arg| > tan(3pi/8)              : atan(|arg|) = pi/2 + atan(-1 / |arg|)
 */
always_inline VecType vec_atan_float(VecType const & arg)
{
    /* interval limits and the constant that is added inside each interval */
    const double tan_pi_8   = 0.41421356237309504880168872420969807856967187537695;
    const double tan_3pi_8  = 2.41421356237309504880168872420969807856967187537698;
    const double quarter_pi = 0.78539816339744830961566084581987572104929234984377;
    const double half_pi    = 1.57079632679489661923132169163975144209858469968754;

    /* coefficients of the x^3, x^5, x^7 and x^9 terms */
    const double c3 = -0.333329498767852783203125;
    const double c5 = 0.19977732002735137939453125;
    const double c7 = -0.1387787759304046630859375;
    const double c9 = 8.054284751415252685546875e-2;

    const VecType sign_bits = arg & VecType::gen_sign_mask();
    const VecType magnitude = arg & VecType::gen_abs_mask();
    const VecType unity     = VecType::gen_one();

    /* lane masks: set where the magnitude lies above the respective limit */
    const VecType above_low  = mask_gt(magnitude, tan_pi_8);
    const VecType above_high = mask_gt(magnitude, tan_3pi_8);

    /* reduced argument per lane; select(a, b, m) yields b where m is set, a elsewhere */
    const VecType reduced_mid   = (magnitude - unity) / (magnitude + unity);
    const VecType reduced_high  = -unity / magnitude;
    const VecType reduced_upper = select(reduced_mid, reduced_high, above_high);
    const VecType r             = select(magnitude, reduced_upper, above_low);

    /* matching additive offset per lane */
    const VecType offset_low   = VecType::gen_zero();
    const VecType offset_mid   = quarter_pi;
    const VecType offset_high  = half_pi;
    const VecType offset_upper = select(offset_mid, offset_high, above_high);
    const VecType offset       = select(offset_low, offset_upper, above_low);

    const VecType r2 = r * r;

    const VecType result_abs = offset + r + r * r2 * (c3 + r2 * (c5 + r2 * (c7 + r2 * c9)));

    /* atan is odd: flip the sign back in for negative inputs */
    return result_abs ^ sign_bits;
}
/* Vectorised single-precision arccosine, built on top of vec_asin_float.
 *
 *   |arg| <= 0.5 : acos(arg) = pi/2 - asin(arg)
 *   arg   >  0.5 : acos(arg) = 2 * asin(sqrt((1 - |arg|) / 2))
 *   arg   < -0.5 : acos(arg) = pi - 2 * asin(sqrt((1 - |arg|) / 2))
 *
 * Lanes with |arg| > 1 yield 0.
 *
 * The branch that is returned is chosen with the very same |arg| > 0.5 mask that decides
 * which argument is handed to vec_asin_float. Deciding the two independently (with
 * arg > 0.5 and arg > -0.5) mismatches at arg == -0.5 exactly: asin would be evaluated on
 * the unreduced argument while the "arg < -0.5" formula is applied, giving 4*pi/3
 * instead of 2*pi/3.
 */
always_inline VecType vec_acos_float(VecType const & arg)
{
    VecType abs_arg = arg & VecType::gen_abs_mask();
    VecType one     = VecType::gen_one();
    VecType half    = VecType::gen_05();
    VecType zero    = VecType::gen_zero();

    /* lanes that need the half-angle reduction */
    VecType arg_greater_05      = mask_gt(abs_arg, half);
    VecType asin_arg_greater_05 = sqrt((one - abs_arg) * half);
    VecType asin_arg            = select(arg, asin_arg_greater_05, arg_greater_05);

    VecType asin_val = vec_asin_float(asin_arg);
    VecType two_asin = asin_val + asin_val;

    /* the literals are pi and pi/2 rounded to single precision */
    VecType ret_m1_m05 = 3.1415927410125732421875 - two_asin;
    VecType ret_m05_05 = 1.57079637050628662109375 - asin_val;
    VecType ret_05_1   = two_asin;

    /* reduced lanes: the sign of arg picks between the two outer formulas */
    VecType ret_outer = select(ret_m1_m05, ret_05_1, mask_gt(arg, zero));

    /* unreduced lanes (|arg| <= 0.5) always use pi/2 - asin(arg) */
    VecType ret = select(ret_m05_05, ret_outer, arg_greater_05);

    // |arg| > 1: return 0
    ret = select(ret, zero, mask_gt(abs_arg, one));
    return ret;
}
/* Vectorised single-precision arcsine.
 *
 * The computation runs on |arg|; the sign bit of the input is put back afterwards.
 *
 *   |arg| <= 0.5 : asin(|arg|) is approximated directly by an odd polynomial
 *   |arg| >  0.5 : asin(|arg|) = pi/2 - 2 * asin(sqrt((1 - |arg|) / 2)),
 *                  with the polynomial evaluated on the reduced argument
 *
 * Lanes with |arg| > 1 yield 0.
 */
always_inline VecType vec_asin_float(VecType const & arg)
{
    VecType abs_arg = arg & VecType::gen_abs_mask();
    VecType sign    = arg & VecType::gen_sign_mask();
    VecType one     = VecType::gen_one();
    VecType half    = VecType::gen_05();
    VecType zero    = VecType::gen_zero();

    // range reduction: asin(x) = pi/2 - 2 asin( sqrt( (1-x)/2 ) ). for |arg| > 0.5
    VecType arg_greater_05  = mask_gt(abs_arg, half);
    VecType arg_reduced_sqr = (one - abs_arg) * half;   // computed once, reused for the root
    VecType arg_reduced     = sqrt(arg_reduced_sqr);
    VecType approx_arg      = select(abs_arg, arg_reduced, arg_greater_05);

    VecType x  = approx_arg;
    VecType x2 = x*x;

    // sollya: fpminimax(asin(x), [|3,5,7,9,11|], [|24...|], [0.000000000000000000001,0.5], x);
    VecType approx_poly = x + x * x2 * (0.166667520999908447265625 + x2 * (7.4953101575374603271484375e-2 + x2 * (4.54690195620059967041015625e-2 + x2 * (2.418550290167331695556640625e-2 + x2 * 4.21570129692554473876953125e-2))));

    // the literal is pi/2 rounded to single precision
    VecType approx_poly_reduced = 1.57079637050628662109375 - approx_poly - approx_poly;
    VecType approx = select(approx_poly, approx_poly_reduced, arg_greater_05);

    // asin is odd: flip the sign back in for negative inputs
    approx = approx ^ sign;

    // |arg| > 1: return 0
    VecType ret = select(approx, zero, mask_gt(abs_arg, one));
    return ret;
}
/* Vectorised floor derived from vec_round_float.
 *
 * Every lane is rounded first; wherever the rounded value ended up above the input,
 * one is subtracted again.
 */
always_inline VecType vec_floor_float(VecType const & arg)
{
    const VecType nearest = vec_round_float(arg);

    /* mask of the lanes in which rounding went upwards */
    const VecType went_up = mask_gt(nearest, arg);

    /* AND-ing the mask with 1.0 leaves 1.0 in the flagged lanes
       (presumably 0.0 in all others, assuming all-ones / all-zeros masks) */
    const VecType correction = went_up & VecType::gen_one();

    return nearest - correction;
}
/* Vectorised single-precision hyperbolic tangent.
 *
 * Three regimes, selected per lane by |arg|:
 *
 *   |arg| >  22     : result is +/-1
 *   |arg| <  0.625  : odd polynomial in arg
 *   otherwise       : 1 - 2 / (vec_exp_tanh_float(2 * |arg|) + 1), with the sign of arg
 *                     (vec_exp_tanh_float is presumably an exponential tailored to this use)
 *
 * The statements are kept in the order large -> small -> medium, which seems to be the
 * most efficient order on sse.
 */
always_inline VecType vec_tanh_float(VecType const & arg)
{
    const VecType sign_bits = arg & VecType::gen_sign_mask();
    const VecType magnitude = arg ^ sign_bits;   /* clears the sign bit */

    const VecType unity           (VecType::gen_one());
    const VecType twice           (2.f);
    const VecType saturation_limit(22.f);
    const VecType polynomial_limit(0.625f);

    /* large values: saturate at one */
    const VecType is_large = mask_gt(magnitude, saturation_limit);

    /* small values: arg + arg^3 * P(arg^2), P evaluated with Horner's scheme */
    const VecType p4(static_cast<float>(-5.70498872745e-3));
    const VecType p3(static_cast<float>( 2.06390887954e-2));
    const VecType p2(static_cast<float>(-5.37397155531e-2));
    const VecType p1(static_cast<float>( 1.33314422036e-1));
    const VecType p0(static_cast<float>(-3.33332819422e-1));

    const VecType squared = magnitude * magnitude;
    const VecType horner  = (((p4 * squared + p3) * squared + p2) * squared + p1) * squared + p0;
    const VecType small_result = horner * squared * arg + arg;
    const VecType is_small     = mask_lt(magnitude, polynomial_limit);

    /* medium values, computed on the magnitude */
    const VecType medium_abs = unity - twice / (vec_exp_tanh_float(magnitude + magnitude) + unity);

    /* merge the large and medium branches, then put the sign back */
    const VecType large_or_medium_abs = select(medium_abs, unity, is_large);
    const VecType large_or_medium     = large_or_medium_abs | sign_bits;

    /* the polynomial result already carries the sign of arg */
    return select(large_or_medium, small_result, is_small);
}
/* Vectorised single-precision exponential.
 *
 * With n = round(arg * log2(e)) and g = arg - n * ln(2):
 *
 *   e**arg = e**(g + n * ln(2)) = e**g * 2**n
 *
 * e**g is approximated by a degree-6 polynomial and scaled by 2**n via ldexp_float.
 * Lanes with arg above 88.72283905206835 return the largest finite float, lanes with
 * arg below -88.72283905206835 return 0.
 */
always_inline VecType vec_exp_float(VecType const & arg)
{
    const float log2_e = 1.44269504088896341f;

    /* ln(2) split into a coarse and a fine part that are subtracted one after the other */
    const float ln2_coarse = 0.693359375f;
    const float ln2_fine   = -2.12194440e-4f;

    /* polynomial coefficients of g^1 .. g^6 */
    const float c1 = 1.00000035762786865234375f;
    const float c2 = 0.4999996721744537353515625f;
    const float c3 = 0.16665561497211456298828125f;
    const float c4 = 4.167006909847259521484375e-2f;
    const float c5 = 8.420792408287525177001953125e-3f;
    const float c6 = 1.386119984090328216552734375e-3f;

    /* n as a float vector and as an integer vector */
    VecType n_float = round(VecType(log2_e) * arg);
    typename VecType::int_vec n_int = n_float.truncate_to_int();

    /* g = arg - n * ln(2), done in two steps */
    VecType g = arg;
    g -= n_float*VecType(ln2_coarse);
    g -= n_float*VecType(ln2_fine);

    /* Theoretical peak relative error in [-0.5, +0.5] is 3.5e-8. */
    VecType poly = VecType(VecType::gen_one()) +
        g * (c1 + g * (c2 + g * (c3 + g * (c4 + g * (c5 + g * c6)))));

    /* multiply by power of 2 */
    VecType scaled = ldexp_float(poly, n_int);

    /* clamp outside the supported input range; the lower limit is the negated upper one
       (-103.278929903431851103f would be an alternative lower limit, it is not used) */
    const VecType upper_limit(88.72283905206835f);
    const VecType lower_limit = -upper_limit;
    const VecType largest_float(std::numeric_limits<float>::max());

    const VecType overflow  = mask_gt(arg, upper_limit);
    const VecType underflow = mask_lt(arg, lower_limit);

    const VecType clamped_high = select(scaled, largest_float, overflow);
    return select(clamped_high, VecType::gen_zero(), underflow);
}