// Builds a per-pixel comb mask for one plane.
//
// For every pixel the vertical neighbours are compared with the current row:
// (above - cur) * (below - cur) is formed in 16-bit lanes, reduced through
// andnot(mulhi, mullo), and tested against `cthresh`.
//
//   dstp / dpitch   destination plane and its row stride in bytes
//   srcp / spitch   source plane and its row stride in bytes
//   cthresh         threshold, narrowed to int16_t before being broadcast
//   width / height  plane dimensions in pixels / rows
//
// The first row uses the row below it as its "above" neighbour, and the last
// row uses the row above it as its "below" neighbour.
// NOTE(review): each iteration handles sizeof(V)/2 pixels and the loop does not
// clamp at `width`, so rows are presumably padded to a multiple of that step —
// confirm against the caller. height < 2 would address a row past the plane.
static void __stdcall comb_mask_1_simd(uint8_t* dstp, const uint8_t* srcp, const int dpitch, const int spitch, const int cthresh, const int width, const int height) noexcept
{
    // Rolling three-row window over the source plane.
    const uint8_t* row_cur = srcp;
    const uint8_t* row_above = row_cur + spitch;  // no real row above row 0: start on row 1
    const uint8_t* row_below = row_cur + spitch;

    const V threshold = set1_i16<V>(static_cast<int16_t>(cthresh));
    // Comparing a register with itself; presumably yields an all-ones vector.
    const V all_bits = cmpeq_i8(threshold, threshold);

    constexpr int pixels_per_iter = sizeof(V) / 2;

    for (int row = 0; row < height; ++row) {
        for (int col = 0; col < width; col += pixels_per_iter) {
            // presumably load_half widens sizeof(V)/2 bytes into 16-bit lanes — TODO confirm
            const V above = load_half<V>(row_above + col);
            const V cur = load_half<V>(row_cur + col);
            const V below = load_half<V>(row_below + col);

            const V diff_above = sub_i16(above, cur);
            const V diff_below = sub_i16(below, cur);

            // High and low halves of the 16x16 product are combined via andnot;
            // the result is then compared (unsigned) against the threshold.
            const V product = andnot(mulhi(diff_above, diff_below), mullo(diff_above, diff_below));
            const V mask = cmpgt_u16(product, threshold, all_bits);

            store_half(dstp + col, mask);
        }

        // Slide the window down one row.
        row_above = row_cur;
        row_cur = row_below;
        if (row < height - 2) {
            row_below += spitch;
        } else {
            // Bottom edge: step back so the last row sees the row above it on both sides.
            row_below -= spitch;
        }
        dstp += dpitch;
    }
}
/* Vectorised ldexp: scales each lane of x by 2^n by adding n directly to the
 * exponent field of the floating-point bit pattern.
 *
 *   x  values to scale
 *   n  per-lane integer exponent offsets
 *
 * The exponent bits are isolated with VecFloat::gen_exp_mask(), n is shifted
 * left by 23 bits and added to them as integers, and the sum is OR-ed back
 * onto the remaining bits of x.
 * NOTE(review): the shift of 23 matches the mantissa width of IEEE-754 single
 * precision; no overflow/underflow or zero/denormal handling is visible here.
 */
always_inline VecFloat ldexp_float(VecFloat const & x, typename VecFloat::int_vec const & n)
{
    typedef typename VecFloat::int_vec int_vec;

    const VecFloat exp_field_mask = VecFloat::gen_exp_mask();

    /* split x into its exponent bits and everything else */
    const VecFloat old_exp_bits = exp_field_mask & x;
    const VecFloat non_exp_bits = andnot(exp_field_mask, x);

    /* move n into exponent position and add it to the existing exponent */
    int_vec shifted_sum = slli(n, 23) + int_vec(old_exp_bits);
    VecFloat new_exp_bits(shifted_sum);

    return non_exp_bits | new_exp_bits;
}
/* Vectorised frexp: splits each lane of x into a fraction (returned) and an
 * integer exponent (written to `exp`).
 *
 *   x    values to decompose
 *   exp  output; receives (exponent field of x >> 23) - 126 per lane
 *
 * The returned value keeps the non-exponent bits of x and replaces the
 * exponent field with VecFloat::gen_exp_mask_1().
 * NOTE(review): gen_exp_mask_1() is presumably the exponent pattern that puts
 * the fraction in [0.5, 1), consistent with the bias of 126 — confirm.
 * No special handling of zero, denormals, infinities or NaN is visible here.
 */
always_inline VecFloat frexp_float(VecFloat const & x, typename VecFloat::int_vec & exp)
{
    typedef typename VecFloat::int_vec int_vec;

    const VecFloat exp_field_mask = VecFloat::gen_exp_mask();

    /* everything except the exponent bits */
    const VecFloat non_exp_bits = andnot(exp_field_mask, x);

    /* exponent bits, reinterpreted as integers and moved down to bit 0 */
    const VecFloat exp_bits = exp_field_mask & x;
    const int_vec exp_bits_int(exp_bits);
    exp = srli(exp_bits_int, 23) - int_vec(126);

    return non_exp_bits | VecFloat::gen_exp_mask_1();
}
/* Vectorised cosine approximation following the structure of the cephes
 * single-precision cosf routine.
 *
 *   arg  input angles in radians
 *
 * Steps:
 *   1. take |arg| (via VecType::gen_abs_mask()) and find the octant index j,
 *      rounded up to an even number;
 *   2. subtract j * pi/4 from |arg| using a three-part split of pi/4;
 *   3. evaluate both a cosine-form and a sine-form polynomial on the reduced
 *      argument and pick one per lane;
 *   4. apply the quadrant-dependent sign by XOR-ing it onto the result.
 *
 * NOTE(review): accuracy for very large |arg| is limited by the float
 * argument reduction; no range check is visible here.
 */
always_inline VecType vec_cos_float(VecType const & arg)
{
    typedef typename VecType::int_vec int_vec;

    const typename VecType::float_type four_over_pi = 1.27323954473516268615107010698011489627567716592367;

    VecType abs_arg = arg & VecType::gen_abs_mask();
    VecType y = abs_arg * four_over_pi;

    int_vec j = y.truncate_to_int();

    /* cephes: j=(j+1) & (~1) */
    j = (j + int_vec(1)) & int_vec(~1);
    y = j.convert_to_float();

    /* sign based on quadrant: bit 2 of ~(j-2) is shifted up by 29 places,
     * i.e. into bit 31 (presumably the sign bit of a 32-bit lane) */
    int_vec jm2 = j - int_vec(2);
    VecType sign = slli(andnot(jm2, int_vec(4)), 29);

    /* polynomial mask: set in lanes where bit 1 of (j-2) is clear */
    VecType poly_mask = VecType (mask_eq(jm2 & int_vec(2), int_vec(0)));

    /* pi/4 split into three parts (DP1 + DP2 + DP3 ~= pi/4) so that y * pi/4
     * can be subtracted piecewise with less rounding error.
     * These are read-only constants, hence const. */
    static const float DP1 = 0.78515625;
    static const float DP2 = 2.4187564849853515625e-4;
    static const float DP3 = 3.77489497744594108e-8;

    VecType base = ((abs_arg - y * DP1) - y * DP2) - y * DP3;

    VecType z = base * base;

    /* cosine-form polynomial: 1 - z/2 + z^2 * P(z) */
    VecType p1 = ((  2.443315711809948E-005 * z
                     - 1.388731625493765E-003) * z
                  + 4.166664568298827E-002) * z * z
                 -0.5f * z + 1
        ;

    /* sine-form polynomial: base + base * z * Q(z) */
    VecType p2 = ((-1.9515295891E-4 * z
                   + 8.3321608736E-3) * z
                  - 1.6666654611E-1) * z * base
                 + base;

    /* per-lane choice between the two polynomials, driven by poly_mask */
    VecType approximation = select(p1, p2, poly_mask);

    return approximation ^ sign;
}
/* Bitwise blend of two vectors under a mask.
 *
 *   lhs      source combined with the mask through andnot()
 *   rhs      source combined with the mask through &
 *   bitmask  selection mask
 *
 * Returns andnot(bitmask, lhs) | (bitmask & rhs).
 * NOTE(review): assuming andnot(a, b) computes (~a & b), this yields rhs
 * where the mask bits are set and lhs where they are clear — confirm.
 */
always_inline VecType vec_select(VecType lhs, VecType rhs, VecType bitmask)
{
    const VecType lhs_part = andnot(bitmask, lhs);
    const VecType rhs_part = bitmask & rhs;
    return lhs_part | rhs_part;
}
/* Mask-driven merge of lhs and rhs.
 *
 * The result is the OR of two terms: lhs passed through andnot() with the
 * mask, and rhs AND-ed with the mask.
 * NOTE(review): presumably andnot(a, b) is (~a & b), which makes this pick
 * rhs bits where bitmask is set and lhs bits elsewhere — confirm.
 */
always_inline VecType vec_select(VecType lhs, VecType rhs, VecType bitmask)
{
    return andnot(bitmask, lhs) | (bitmask & rhs);
}