/*
 * Evaluates exp(x) on each of the eight packed single-precision lanes of x.
 *
 * Steps:
 *   1. clamp x to the range [_ps256_exp_lo, _ps256_exp_hi];
 *   2. pick n = floor(x * LOG2EF + 0.5) so that exp(x) = exp(g) * 2^n;
 *   3. form the reduced argument g = x - n*C1 - n*C2 (two-step subtraction);
 *   4. evaluate exp(g) as 1 + g + g^2 * P(g), P being the degree-5 polynomial
 *      with coefficients p0..p5, evaluated by Horner's scheme;
 *   5. build 2^n by placing n + 0x7f in the exponent field and multiply.
 *
 * All numeric constants come from the _ps256_* / _pi32_256_* tables defined
 * elsewhere in this file.
 *
 * Returns the eight results as a v8sf.
 */
v8sf exp256_ps(v8sf x) {
  const v8sf one = *(v8sf*)_ps256_1;

  /* clamp the argument: min against exp_hi first, then max against exp_lo */
  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  v8sf fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* n = floor(fx).  _mm256_floor_ps never yields a value above its input, so
     the "if greater, subtract 1" correction that a truncating conversion
     would need is a no-op here and is therefore omitted. */
  fx = _mm256_floor_ps(fx);

  /* g = x - n*C1 - n*C2, subtracted in two steps */
  const v8sf n_c1 = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  const v8sf n_c2 = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, n_c1);
  x = _mm256_sub_ps(x, n_c2);

  const v8sf z = _mm256_mul_ps(x, x);

  /* Horner evaluation of P(g) with coefficients p0 (highest) .. p5 */
  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);

  /* exp(g) = P(g)*g^2 + g + 1 */
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n: convert n to int32, add 0x7f, shift into the exponent bits */
  v8si imm0 = _mm256_cvttps_epi32(fx);
  /* another two AVX2 instructions */
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  const v8sf pow2n = _mm256_castsi256_ps(imm0);

  return _mm256_mul_ps(y, pow2n);
}
/* Invokes each packed-float roundscale / ceil / floor intrinsic form once:
   first the 256-bit (_mm256_*) variants on x1, then the 128-bit (_mm_*)
   variants on x2.  Every call reads its operand from, and stores its result
   back into, the same object.

   x1 and x2 are not declared inside this block.
   NOTE(review): presumably file-scope __m256 / __m128 objects -- confirm.
   NOTE(review): this looks like a compiler test fixture; if so, the exact set
   and order of calls is what is being checked and must not be altered.  */
void extern avx512vl_test (void)
{
  /* 256-bit forms: plain roundscale, ceil, floor, merge-masked, zero-masked.
     NOTE(review): 0x42 is the imm8 rounding/scale selector and 2 is the lane
     write-mask -- see the intrinsics reference for the exact encoding.  */
  x1 = _mm256_roundscale_ps (x1, 0x42);
  x1 = _mm256_ceil_ps (x1);
  x1 = _mm256_floor_ps (x1);
  x1 = _mm256_mask_roundscale_ps (x1, 2, x1, 0x42);
  x1 = _mm256_maskz_roundscale_ps (2, x1, 0x42);

  /* 128-bit forms: the same five calls, same immediates, applied to x2.  */
  x2 = _mm_roundscale_ps (x2, 0x42);
  x2 = _mm_ceil_ps (x2);
  x2 = _mm_floor_ps (x2);
  x2 = _mm_mask_roundscale_ps (x2, 2, x2, 0x42);
  x2 = _mm_maskz_roundscale_ps (2, x2, 0x42);
}
inline __m256i avx2_positive_hexid_to_ringid_root(const __m256i hexid) { // The following algorithm works until hexid=12,589,056 // const unsigned iarg = 1+4*(hexid-1)/3; // return (unsigned(std::sqrt(float(iarg)))+1)/2; __m256 arg = _mm256_cvtepi32_ps(hexid); arg = _mm256_fmsub_ps(arg, calin::math::simd::c_m256(_c_m256_four_thirds), calin::math::simd::c_m256(_c_m256_one_third)); arg = _mm256_sqrt_ps(arg); arg = _mm256_fmadd_ps(arg, calin::math::simd::c_m256(_c_m256_one_half), calin::math::simd::c_m256(_c_m256_one_half)); arg = _mm256_floor_ps(arg); return _mm256_cvtps_epi32(arg); }