__m256 mm256_exp_ps(__m256 x) { __m256 tmp = _mm256_setzero_ps(), fx; __m256i emm0; __m256 one = *(__m256*)m256_ps_1; x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi); x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF); fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5); /* how to perform a floorf with SSE: just below */ /* step 1 : cast to int */ emm0 = _mm256_cvttps_epi32(fx); /* step 2 : cast back to float */ tmp = _mm256_cvtepi32_ps(emm0); /* if greater, substract 1 */ __m256 mask = _mm256_cmp_ps( tmp, fx, _CMP_GT_OS ); mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1); __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2); x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); z = _mm256_mul_ps(x,x); __m256 y = *(__m256*)m256_ps_cephes_exp_p0; y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); /* build 2^n */ emm0 = _mm256_cvttps_epi32(fx); emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f); emm0 = _mm256_slli_epi32(emm0, 23); __m256 pow2n = _mm256_castsi256_ps(emm0); y = _mm256_mul_ps(y, pow2n); _mm256_zeroupper(); return y; }
/* exp(x) over eight packed floats, Cephes-style:
 * exp(x) = 2^n * exp(g) with n = floor(x*log2(e) + 0.5). */
v8sf exp256_ps(v8sf x) {
  v8sf vone = *(v8sf*)_ps256_1;
  v8si n_int;

  /* Clamp to the domain where a finite float result exists. */
  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* n = floor(x*log2(e) + 0.5) */
  v8sf n_flt = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  n_flt = _mm256_add_ps(n_flt, *(v8sf*)_ps256_0p5);
  v8sf fl = _mm256_floor_ps(n_flt);
  /* Correction step kept from the SSE floorf emulation: subtract 1 where
   * the candidate exceeds the argument (a no-op after a true floor). */
  v8sf corr = _mm256_cmp_ps(fl, n_flt, _CMP_GT_OS);
  corr = _mm256_and_ps(corr, vone);
  n_flt = _mm256_sub_ps(fl, corr);

  /* g = x - n*ln(2); ln(2) split across C1 + C2 for extra precision. */
  v8sf hi_part = _mm256_mul_ps(n_flt, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf lo_part = _mm256_mul_ps(n_flt, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, hi_part);
  x = _mm256_sub_ps(x, lo_part);

  v8sf g2 = _mm256_mul_ps(x, x);

  /* Degree-5 polynomial for exp(g), Horner form. */
  v8sf poly = *(v8sf*)_ps256_cephes_exp_p0;
  poly = _mm256_mul_ps(poly, x);
  poly = _mm256_add_ps(poly, *(v8sf*)_ps256_cephes_exp_p1);
  poly = _mm256_mul_ps(poly, x);
  poly = _mm256_add_ps(poly, *(v8sf*)_ps256_cephes_exp_p2);
  poly = _mm256_mul_ps(poly, x);
  poly = _mm256_add_ps(poly, *(v8sf*)_ps256_cephes_exp_p3);
  poly = _mm256_mul_ps(poly, x);
  poly = _mm256_add_ps(poly, *(v8sf*)_ps256_cephes_exp_p4);
  poly = _mm256_mul_ps(poly, x);
  poly = _mm256_add_ps(poly, *(v8sf*)_ps256_cephes_exp_p5);
  poly = _mm256_mul_ps(poly, g2);
  poly = _mm256_add_ps(poly, x);
  poly = _mm256_add_ps(poly, vone);

  /* 2^n: put n+127 into the exponent field (two AVX2 integer ops). */
  n_int = _mm256_cvttps_epi32(n_flt);
  n_int = _mm256_add_epi32(n_int, *(v8si*)_pi32_256_0x7f);
  n_int = _mm256_slli_epi32(n_int, 23);
  v8sf scale = _mm256_castsi256_ps(n_int);

  return _mm256_mul_ps(poly, scale);
}
/* Vectorized sinf over 8 lanes: Cephes octant reduction, then a per-lane
 * choice between a sine and a cosine minimax polynomial. */
inline avx_m256_t newsin_ps(avx_m256_t x) {
  avx_m256_t sign = _mm256_and_ps(x, _ps_sign_mask);
  x = _mm256_and_ps(x, _ps_inv_sign_mask);        /* |x| */

  /* Octant index j = (int)(|x| * 4/pi), then j = (j+1) & ~1. */
  avx_m256_t q = _mm256_mul_ps(x, _ps_cephes_FOPI);
  avx_m256i_t quad = _mm256_cvttps_epi32(q);
  quad = _mm256_add_epi32(quad, _pi32_1);
  quad = _mm256_and_si256(quad, _pi32_inv1);
  q = _mm256_cvtepi32_ps(quad);

  /* Bit 2 of j flips the sign; bit 1 selects which polynomial applies. */
  avx_m256i_t flip = _mm256_and_si256(quad, _pi32_4);
  flip = _mm256_slli_epi32(flip, 29);
  avx_m256i_t sel = _mm256_and_si256(quad, _pi32_2);
  sel = _mm256_cmpeq_epi32(sel, _mm256_setzero_si256());

  avx_m256_t flip_f = _mm256_castsi256_ps(flip);
  avx_m256_t poly_mask = _mm256_castsi256_ps(sel);
  sign = _mm256_xor_ps(sign, flip_f);

  /* Extended-precision argument reduction: x += q * -(DP1+DP2+DP3). */
  avx_m256_t r = _mm256_mul_ps(q, _ps_minus_cephes_DP123);
  x = _mm256_add_ps(x, r);

  avx_m256_t x2 = _mm256_mul_ps(x, x);
  avx_m256_t x3 = _mm256_mul_ps(x2, x);
  avx_m256_t x4 = _mm256_mul_ps(x2, x2);

  /* Evaluate the cosine (pc) and sine (ps) polynomials side by side. */
  avx_m256_t pc = _ps_coscof_p0;
  avx_m256_t ps = _ps_sincof_p0;
  pc = _mm256_mul_ps(pc, x2);
  ps = _mm256_mul_ps(ps, x2);
  pc = _mm256_add_ps(pc, _ps_coscof_p1);
  ps = _mm256_add_ps(ps, _ps_sincof_p1);
  pc = _mm256_mul_ps(pc, x2);
  ps = _mm256_mul_ps(ps, x2);
  pc = _mm256_add_ps(pc, _ps_coscof_p2);
  ps = _mm256_add_ps(ps, _ps_sincof_p2);
  pc = _mm256_mul_ps(pc, x4);
  ps = _mm256_mul_ps(ps, x3);

  /* cos tail: pc - (0.5*x2 - 1); sin tail: ps + x. */
  r = _mm256_mul_ps(x2, _ps_0p5);
  r = _mm256_sub_ps(r, _ps_1);
  pc = _mm256_sub_ps(pc, r);
  ps = _mm256_add_ps(ps, x);

  /* Per-lane polynomial select, then restore the sign. */
  pc = _mm256_andnot_ps(poly_mask, pc);
  ps = _mm256_and_ps(poly_mask, ps);
  avx_m256_t res = _mm256_add_ps(pc, ps);
  return _mm256_xor_ps(res, sign);
} // newsin_ps()
/* Testsuite-style check for _mm256_cvttps_epi32: convert eight packed
   floats to int32 with truncation and compare each lane against the
   scalar (int) cast, which also truncates toward zero. */
void static avx_test (void)
{
  int i;
  union256 s1;    /* float source vector */
  union256i_d u;  /* converted integer vector */
  int e [8];      /* expected lane values */

  s1.x = _mm256_set_ps (45.64, 4564.56, 2.3, 5.5, 57.57, 89.34, 54.12, 954.67);

  u.x = _mm256_cvttps_epi32 (s1.x);

  /* Reference result: per-lane truncating scalar conversion. */
  for (i = 0; i < 8; i++)
    e[i] = (int)s1.a[i];

  /* check_union256i_d returns non-zero on mismatch. */
  if (check_union256i_d (u, e))
    abort ();
}
/* Fast vector exp: exp(x) = 2^(x*log2(e)) = 2^n * 2^f, with n = floor(t)
 * and f = t - n in [0,1); 2^f comes from a degree-4 polynomial. */
__m256 _inner_mm256_exp_ps1(__m256 arg) {
  const __m256 log2e =
      _mm256_set1_ps(1.4426950408889634073599246810018921374266459541529859f);
  arg = _mm256_mul_ps(arg, log2e);

  /* floor(arg): truncate, then add the (arg < 0) all-ones mask (== -1)
   * for negative lanes. */
  __m256i n = _mm256_add_epi32(
      _mm256_castps_si256(_mm256_cmp_ps(arg, _mm256_set1_ps(0.0f), _CMP_LT_OQ)),
      _mm256_cvttps_epi32(arg));
  arg = _mm256_sub_ps(arg, _mm256_cvtepi32_ps(n));  /* fractional part */

  /* Degree-4 polynomial for 2^f on [0,1), Horner form via FMA. */
  __m256 poly = _mm256_set1_ps(0.0136779459179717f);
  poly = _mm256_fmadd_ps(poly, arg, _mm256_set1_ps(0.0517692205767896f));
  poly = _mm256_fmadd_ps(poly, arg, _mm256_set1_ps(0.241554388295527f));
  poly = _mm256_fmadd_ps(poly, arg, _mm256_set1_ps(0.692998430056128f));
  poly = _mm256_fmadd_ps(poly, arg, _mm256_set1_ps(0.999999804292074f));

  /* 2^n: place n+127 in the float exponent field, then scale. */
  __m256 scale = _mm256_castsi256_ps(
      _mm256_slli_epi32(_mm256_add_epi32(n, _mm256_set1_epi32(127)), 23));
  return _mm256_mul_ps(scale, poly);
}
/* Rank-sort the first six lanes of llrI (descending; see rank formula).
 * pos maps rank -> lane index, ipos maps lane index -> rank; lanes 6 and 7
 * are passed through as identity. */
inline void Sort4Deg6(__m256 llrI, int pos[], int ipos[]) {
	int llr[8] __attribute__((aligned(64)));
	// Scale by 2^26 before the truncating float->int conversion so the
	// integer comparisons below preserve the float ordering.
	const auto v1 = _mm256_set1_ps( 67108864.0f );
	const auto v2 = _mm256_mul_ps( v1, llrI );
	_mm256_store_si256((__m256i *)llr, _mm256_cvttps_epi32(v2));
	const auto x0 = llr[0];
	const auto x1 = llr[1];
	const auto x2 = llr[2];
	const auto x3 = llr[3];
	const auto x4 = llr[4];
	const auto x5 = llr[5];
	// Each element's rank = number of elements ordered after it; ties are
	// broken by lane index (strict < against later lanes, <= against
	// earlier ones), so the ranks form a permutation of 0..5.
	int o0 = (x0<x1) +(x0<x2)+(x0<x3)+(x0<x4)+(x0<x5);
	int o1 = (x1<=x0)+(x1<x2)+(x1<x3)+(x1<x4)+(x1<x5);
	int o2 = (x2<=x0)+(x2<=x1)+(x2<x3)+(x2<x4)+(x2<x5);
	int o3 = (x3<=x0)+(x3<=x1)+(x3<=x2)+(x3<x4)+(x3<x5);
	int o4 = (x4<=x0)+(x4<=x1)+(x4<=x2)+(x4<=x3)+(x4<x5);
	// Ranks sum to 0+1+...+5 = 15, so the last one needs no comparisons.
	int o5 = 15-(o0+o1+o2+o3+o4);
	pos[o0] = 0; pos[o1]= 1; pos[o2]= 2; pos[o3]= 3; pos[o4]= 4; pos[o5]= 5;
	pos[6]=6; pos[7]=7;
	ipos[ 0] = o0; ipos[ 1]=o1; ipos[ 2]=o2; ipos[ 3]=o3; ipos[ 4]=o4; ipos[ 5]=o5;
	ipos[6]=6; ipos[7]=7;
}
{ template<class Sig> struct result; template<class This,class A0> struct result<This(A0)> { typedef typename meta::as_integer<A0>::type type; }; NT2_FUNCTOR_CALL_DISPATCH( 1, typename nt2::meta::scalar_of<A0>::type, (3, (float,double,arithmetic_)) ) NT2_FUNCTOR_CALL_EVAL_IF(1, float) { typedef typename NT2_CALL_RETURN_TYPE(1)::type type; type that = {_mm256_cvttps_epi32(a0)}; return that; } NT2_FUNCTOR_CALL_EVAL_IF(1, double) { typedef typename NT2_CALL_RETURN_TYPE(1)::type type; const type v = {{a0[0],a0[1], a0[2],a0[3]}}; //TODO with _mm_cvttpd_epi32 return v; } NT2_FUNCTOR_CALL_EVAL_IF(1, arithmetic_) { return a0; } }; } }
/* Forward 8x8 DCT with quantization over a whole image.
 * width/height: image dimensions in pixels (assumed multiples of 8).
 * quant: 64 quantization divisors, row-major.
 * input: width*height pixel values (level-shifted by -128 below).
 * output: 64 int32 coefficients per block, blocks stored consecutively.
 *
 * Fixes vs. the original: invalid `0.5d` literal suffix -> 0.5; explicit
 * (__m256i *) cast on the _mm256_storeu_si256 destinations (output is
 * int32_t*); the eight copy-pasted per-row sections are collapsed into
 * loops; unused avload buffer and the GNU vector-subscript extraction
 * (avxcosloader[i], equal to acosvals[y][i]) removed. */
void run_dct(int width, int height, float *quant, float *input, int32_t *output) {
  /* Only 64 distinct cosine factors exist; precompute them once. */
  float acosvals[8][8];
  for (int i = 0; i < 8; i++) {
    for (int j = 0; j < 8; j++) {
      if (j == 0) {
        acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5) * j);
      } else {
        acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5) * j);
      }
    }
  }

  /* Separate parallel from for so each thread gets private buffers. */
#pragma omp parallel
  {
    __m256 rows[8];
    __m256 loaderlow, loaderhigh, temp;
    const __m256 minus128 = _mm256_set1_ps(-128.0f);

#pragma omp for
    for (int brow = 0; brow < height / 8; brow++) {
      for (int bcol = 0; bcol < width / 8; bcol++) {
        int head_pointer = bcol * 8 + brow * 8 * width;
        for (int r = 0; r < 8; r++)
          rows[r] = _mm256_setzero_ps();

        /* Accumulate each pixel's contribution to all eight frequency
         * rows at once.  Samples y and 7-y are paired: by cosine
         * symmetry the mirrored sample adds into even rows and
         * subtracts from odd rows, halving the y iterations. */
        for (int x = 0; x < 8; x++) {
          for (int y = 0; y < 4; y++) {
            loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
            loaderlow = _mm256_add_ps(loaderlow, minus128);
            loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
            loaderhigh = _mm256_add_ps(loaderhigh, minus128);
            __m256 avxcos = _mm256_loadu_ps(&acosvals[x][0]);
            loaderlow = _mm256_mul_ps(loaderlow, avxcos);
            loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);
            for (int r = 0; r < 8; r++) {
              __m256 coef = _mm256_set1_ps(acosvals[y][r]);
              temp = _mm256_mul_ps(loaderlow, coef);
              rows[r] = _mm256_add_ps(rows[r], temp);
              temp = _mm256_mul_ps(loaderhigh, coef);
              /* even row: mirrored sample adds; odd row: it subtracts */
              rows[r] = (r & 1) ? _mm256_sub_ps(rows[r], temp)
                                : _mm256_add_ps(rows[r], temp);
            }
          }
        }

        /* Quantize, round to nearest, and store block-sequentially. */
        int32_t *block = output + (bcol + brow * (width / 8)) * 64;
        for (int r = 0; r < 8; r++) {
          temp = _mm256_loadu_ps(&quant[8 * r]);
          rows[r] = _mm256_div_ps(rows[r], temp);
          rows[r] = _mm256_round_ps(rows[r], _MM_FROUND_TO_NEAREST_INT);
          _mm256_storeu_si256((__m256i *)(block + 8 * r),
                              _mm256_cvttps_epi32(rows[r]));
        }
      }
    }
  }
}
/* exp(x) for eight packed floats using AVX1 only (256-bit integer ops are
 * emulated on 128-bit lanes).
 * Fix: the add of the exponent bias (emm0 += 0x7f) was done with
 * _mm256_add_ps on bit-cast integers.  That trick is only exact while both
 * bit patterns are positive denormals (and FTZ/DAZ is off); for negative n
 * the int bit pattern is a NaN and the float add corrupts the result.  It
 * is now done with per-lane _mm_add_epi32, matching the lane-split shift
 * this function already performs below. */
inline avx_m256_t newexp_ps(avx_m256_t x) {
  avx_m256_t one = _ps_1;
  avx_m256_t zero = _ps_0;

  /* clamp to the representable range */
  x = _mm256_min_ps(x, _ps_exp_hi);
  x = _mm256_max_ps(x, _ps_exp_lo);

  /* n = floor(x*log2(e) + 0.5): truncate, then subtract 1 where the
   * truncation rounded up */
  avx_m256_t temp_2 = _mm256_mul_ps(x, _ps_cephes_LOG2EF);
  temp_2 = _mm256_add_ps(temp_2, _ps_0p5);
  avx_m256i_t emm0 = _mm256_cvttps_epi32(temp_2);
  avx_m256_t temp_1 = _mm256_cvtepi32_ps(emm0);
  avx_m256_t temp_3 = _mm256_sub_ps(temp_1, temp_2);
  avx_m256_t mask = _mm256_cmp_ps(temp_3, zero, _CMP_GT_OQ);
  mask = _mm256_and_ps(mask, one);
  temp_2 = _mm256_sub_ps(temp_1, mask);
  emm0 = _mm256_cvttps_epi32(temp_2);

  /* g = x - n*ln(2) */
  temp_1 = _mm256_mul_ps(temp_2, _ps_cephes_exp_C12);
  x = _mm256_sub_ps(x, temp_1);

  avx_m256_t x2 = _mm256_mul_ps(x, x);
  avx_m256_t x3 = _mm256_mul_ps(x2, x);
  avx_m256_t x4 = _mm256_mul_ps(x2, x2);

  /* polynomial for exp(g), evaluated in a regrouped (non-Horner) order */
  temp_1 = _mm256_add_ps(x, one);
  temp_2 = _mm256_mul_ps(x2, _ps_cephes_exp_p5);
  temp_3 = _mm256_mul_ps(x3, _ps_cephes_exp_p4);
  temp_1 = _mm256_add_ps(temp_1, temp_2);
  temp_2 = _mm256_mul_ps(x3, _ps_cephes_exp_p0);
  temp_1 = _mm256_add_ps(temp_1, temp_3);
  avx_m256_t temp_4 = _mm256_mul_ps(x, _ps_cephes_exp_p2);
  temp_3 = _mm256_mul_ps(x2, _ps_cephes_exp_p1);

  /* emm0 += 0x7f, per 128-bit lane (no _mm256_add_epi32 in AVX1) */
  __m128i addhi = _mm256_extractf128_si256(emm0, 0);
  __m128i addlo = _mm256_extractf128_si256(emm0, 1);
  addhi = _mm_add_epi32(addhi, _mm256_extractf128_si256(_pi32_0x7f, 0));
  addlo = _mm_add_epi32(addlo, _mm256_extractf128_si256(_pi32_0x7f, 1));
  emm0 = _mm256_insertf128_si256(emm0, addhi, 0);
  emm0 = _mm256_insertf128_si256(emm0, addlo, 1);

  temp_2 = _mm256_add_ps(temp_2, temp_3);
  temp_3 = _mm256_add_ps(temp_3, temp_4);

  /* emm0 <<= 23, per 128-bit lane: builds 2^n in the exponent field */
  __m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
  __m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
  emm0hi = _mm_slli_epi32(emm0hi, 23);
  emm0lo = _mm_slli_epi32(emm0lo, 23);
  emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
  emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);
  avx_m256_t pow2n = _mm256_castsi256_ps(emm0);

  temp_2 = _mm256_add_ps(temp_2, temp_3);
  temp_2 = _mm256_mul_ps(temp_2, x4);

  avx_m256_t y = _mm256_add_ps(temp_1, temp_2);
  y = _mm256_mul_ps(y, pow2n);
  return y;
} // newexp_ps()
/* Sine and cosine of two independent 8-float vectors in one pass; the two
 * streams (suffix 1 and 2) are interleaved for instruction-level
 * parallelism.  AVX1-only: 256-bit integer ops are emulated with float
 * bitwise ops or per-128-bit-lane SSE ops.
 * Fix: the second stream's shifted sign-flip bits (emm02) were rebuilt from
 * the FIRST stream's lanes (emm0hi1/emm0lo1); it now reinserts its own
 * emm0hi2/emm0lo2. */
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
						avx_m256_t *c1, avx_m256_t *c2) {
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	/* octant index j = (int)(|x| * 4/pi + 1) & ~1 (the +1 is folded into
	 * the float domain because _mm256_add_epi32 is unavailable) */
	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	/* cosine uses the quadrant shifted by -2 (done in float) */
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	/* bit 2 of the (shifted) quadrant drives the result's sign flip */
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_4)));

	/* emm01 <<= 29, per 128-bit lane (no 256-bit shift in AVX1) */
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	/* emm02 <<= 29 — BUGFIX: reinsert this stream's own shifted lanes */
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	/* cos_emm01 <<= 29 */
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1);

	/* cos_emm02 <<= 29 */
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1);

	/* polynomial-select masks: all-ones where (j & 2) == 0 */
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_2)));
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_2)));
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	/* extended-precision argument reduction: x += y * -(DP1+DP2+DP3) */
	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	/* cosine (y) and sine (y2x) polynomials for both streams */
	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;
	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);

	/* tails: cos -= (0.5*x2 - 1); sin += x */
	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;

	/* per-lane polynomial select, then sign restore */
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
/* Computes sine (*s) and cosine (*c) of eight packed floats in one pass.
 * AVX1-only port: 256-bit integer ops are emulated either with float
 * bitwise ops on bit-cast vectors or per-128-bit-lane SSE ops; the
 * original AVX2 forms are kept in comments next to each replacement. */
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	/* octant index j = (int)(|x| * 4/pi + 1) & ~1 */
	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	/* the +1 is folded into the float domain before truncation */
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	/* cosine works on the quadrant shifted by -2 (subtraction in float) */
	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	/* bit 2 of the (shifted) quadrant selects the sign flip */
	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_4)));

	/* move that bit to the float sign position: << 29, done per lane
	 * because AVX1 has no 256-bit integer shift */
	//emm0 = _mm256_slli_epi32(emm0, 29);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1);

	/* polynomial-select mask: all-ones where (j & 2) == 0 */
	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	/* extended-precision argument reduction: x += y * -(DP1+DP2+DP3) */
	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	/* evaluate the cosine (y) and sine (y2) polynomials side by side */
	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;

	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);

	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);

	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);

	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);

	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);

	/* tails: cos -= (0.5*x2 - 1); sin += x */
	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;

	/* per-lane polynomial select, then sign restore */
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);

	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);

	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
/* exp() of two independent 8-float vectors in one pass; the two streams
 * (suffix 1 and 2) are interleaved for instruction-level parallelism.
 * Same algorithm as newexp_ps: exp(x) = 2^n * exp(g). */
inline void newexp_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t* exp1, avx_m256_t* exp2) {
	avx_m256_t one = _ps_1;
	avx_m256_t zero = _ps_0;

	/* clamp to the representable range */
	x1 = _mm256_min_ps(x1, _ps_exp_hi);
	x2 = _mm256_min_ps(x2, _ps_exp_hi);
	x1 = _mm256_max_ps(x1, _ps_exp_lo);
	x2 = _mm256_max_ps(x2, _ps_exp_lo);

	/* n = floor(x*log2(e) + 0.5): truncate, then subtract 1 where the
	 * truncation rounded up */
	avx_m256_t temp_21 = _mm256_mul_ps(x1, _ps_cephes_LOG2EF);
	avx_m256_t temp_22 = _mm256_mul_ps(x2, _ps_cephes_LOG2EF);
	temp_21 = _mm256_add_ps(temp_21, _ps_0p5);
	temp_22 = _mm256_add_ps(temp_22, _ps_0p5);
	avx_m256i_t emm01 = _mm256_cvttps_epi32(temp_21);
	avx_m256i_t emm02 = _mm256_cvttps_epi32(temp_22);
	avx_m256_t temp_11 = _mm256_cvtepi32_ps(emm01);
	avx_m256_t temp_12 = _mm256_cvtepi32_ps(emm02);
	avx_m256_t temp_31 = _mm256_sub_ps(temp_11, temp_21);
	avx_m256_t temp_32 = _mm256_sub_ps(temp_12, temp_22);
	avx_m256_t mask1 = _mm256_cmp_ps(temp_31, zero, _CMP_GT_OQ);
	avx_m256_t mask2 = _mm256_cmp_ps(temp_32, zero, _CMP_GT_OQ);
	mask1 = _mm256_and_ps(mask1, one);
	mask2 = _mm256_and_ps(mask2, one);
	temp_21 = _mm256_sub_ps(temp_11, mask1);
	temp_22 = _mm256_sub_ps(temp_12, mask2);
	emm01 = _mm256_cvttps_epi32(temp_21);
	emm02 = _mm256_cvttps_epi32(temp_22);

	/* g = x - n*ln(2) */
	temp_11 = _mm256_mul_ps(temp_21, _ps_cephes_exp_C12);
	temp_12 = _mm256_mul_ps(temp_22, _ps_cephes_exp_C12);
	x1 = _mm256_sub_ps(x1, temp_11);
	x2 = _mm256_sub_ps(x2, temp_12);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	/* polynomial for exp(g), evaluated in a regrouped order */
	temp_11 = _mm256_add_ps(x1, one);
	temp_12 = _mm256_add_ps(x2, one);
	temp_21 = _mm256_mul_ps(x21, _ps_cephes_exp_p5);
	temp_22 = _mm256_mul_ps(x22, _ps_cephes_exp_p5);
	temp_31 = _mm256_mul_ps(x31, _ps_cephes_exp_p4);
	temp_32 = _mm256_mul_ps(x32, _ps_cephes_exp_p4);
	temp_11 = _mm256_add_ps(temp_11, temp_21);
	temp_12 = _mm256_add_ps(temp_12, temp_22);
	temp_21 = _mm256_mul_ps(x31, _ps_cephes_exp_p0);
	temp_22 = _mm256_mul_ps(x32, _ps_cephes_exp_p0);
	temp_11 = _mm256_add_ps(temp_11, temp_31);
	temp_12 = _mm256_add_ps(temp_12, temp_32);
	avx_m256_t temp_41 = _mm256_mul_ps(x1, _ps_cephes_exp_p2);
	avx_m256_t temp_42 = _mm256_mul_ps(x2, _ps_cephes_exp_p2);
	temp_31 = _mm256_mul_ps(x21, _ps_cephes_exp_p1);
	temp_32 = _mm256_mul_ps(x22, _ps_cephes_exp_p1);

	/* NOTE(review): integer add n + 0x7f emulated with a float add on
	 * bit-cast integers.  This is only exact while both bit patterns are
	 * positive denormals (FTZ/DAZ off); for negative n the bit pattern is
	 * a NaN, so the result looks wrong for x < 0 — verify, and consider
	 * the per-lane _mm_add_epi32 workaround used for the shifts below. */
	//emm01 = _mm256_add_epi32(emm01, _pi32_0x7f);
	//emm02 = _mm256_add_epi32(emm02, _pi32_0x7f);
	emm01 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm01), _mm256_castsi256_ps(_pi32_0x7f)));
	emm02 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm02), _mm256_castsi256_ps(_pi32_0x7f)));

	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_31 = _mm256_add_ps(temp_31, temp_41);
	temp_32 = _mm256_add_ps(temp_32, temp_42);

	/* emm01 <<= 23, per 128-bit lane (no 256-bit shift in AVX1) */
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 23);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 23);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	/* emm02 <<= 23 */
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 23);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 23);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	avx_m256_t pow2n1 = _mm256_castsi256_ps(emm01);
	avx_m256_t pow2n2 = _mm256_castsi256_ps(emm02);

	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_21 = _mm256_mul_ps(temp_21, x41);
	temp_22 = _mm256_mul_ps(temp_22, x42);

	avx_m256_t y1 = _mm256_add_ps(temp_11, temp_21);
	avx_m256_t y2 = _mm256_add_ps(temp_12, temp_22);

	*exp1 = _mm256_mul_ps(y1, pow2n1);
	*exp2 = _mm256_mul_ps(y2, pow2n2);
} // newexp_ps_dual()
__m256 mm256_cos_ps(__m256 x) { __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y; __m256i emm0, emm2; /* take the absolute value */ x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI); /* store the integer part of y in mm0 */ emm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1); emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1); y = _mm256_cvtepi32_ps(emm2); emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2); /* get the swap sign flag */ emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4); emm0 = _mm256_slli_epi32(emm0, 29); /* get the polynom selection mask */ emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2); emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); __m256 sign_bit = _mm256_castsi256_ps(emm0); __m256 poly_mask = _mm256_castsi256_ps(emm2); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m256*)m256_ps_minus_cephes_DP1; xmm2 = *(__m256*)m256_ps_minus_cephes_DP2; xmm3 = *(__m256*)m256_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(__m256*)m256_ps_coscof_p0; __m256 z = _mm256_mul_ps(x,x); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(__m256*)m256_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m256 y2 = *(__m256*)m256_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, 
*(__m256*)m256_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm256_and_ps(xmm3, y2); //, xmm3); y = _mm256_andnot_ps(xmm3, y); y = _mm256_add_ps(y,y2); /* update the sign */ y = _mm256_xor_ps(y, sign_bit); _mm256_zeroupper(); return y; }
/* Since sin256_ps and cos256_ps are almost identical, sincos256_ps could
   replace both of them.  It is almost as fast, and gives you a free cosine
   with your sine.
   Computes sin(x) into *s and cos(x) into *c for 8 packed floats, following
   the Cephes sinf/cosf range reduction and minimax polynomials. */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  /* without AVX2 the 256-bit integer ops are emulated on two 128-bit halves */
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi to get the quadrant index */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (round up to even, see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine: bit 2 of j -> float sign bit */
  imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  /* swap-sign flag for the sine */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* polynomial selection mask for the sine */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: extended-precision modular arithmetic,
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  /* sign flag for the cosine: bit 2 of (j-2), via andnot */
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si128(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (cosine, 0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (sine, Pi/4 <= x <= 0) */
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials; the cosine uses
     the complementary lanes of the same mask */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
/* Sine of 8 packed floats at once using AVX intrinsics — a direct
   transcription of the Cephes sinf function.  Precision is excellent as
   long as |x| < 8192 (the special handling Cephes performs for greater
   values is omitted; larger arguments are not garbage, they merely lose
   the extra precision).  Note that sinf((float)M_PI) = 8.74e-8, which is
   the surprising but correct result. */
v8sf sin256_ps(v8sf x) { // any x
  v8sf reduce1, reduce2, reduce3, sign_bit, y;
  v8si swap_bits, quadrant;
#ifndef __AVX2__
  /* 128-bit halves used when 256-bit integer ops are unavailable */
  v4si sw_lo, sw_hi;
  v4si q_lo, q_hi;
#endif

  /* grab the sign of the input, then work on |x| */
  sign_bit = _mm256_and_ps(x, *(v8sf*)_ps256_sign_mask);
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);

  /* quadrant index: y = |x| * 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* The integer work below needs AVX2; without it, fall back to SSE2 on
     the two 128-bit halves. */
#ifdef __AVX2__
  quadrant = _mm256_cvttps_epi32(y);
  /* j = (j+1) & ~1 (round up to even, see the cephes sources) */
  quadrant = _mm256_add_epi32(quadrant, *(v8si*)_pi32_256_1);
  quadrant = _mm256_and_si128(quadrant, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(quadrant);

  /* swap-sign flag: bit 2 of j shifted into the float sign position */
  swap_bits = _mm256_and_si128(quadrant, *(v8si*)_pi32_256_4);
  swap_bits = _mm256_slli_epi32(swap_bits, 29);

  /* polynomial selection mask: one polynomial covers 0 <= x <= Pi/4, the
     other Pi/4 < x <= Pi/2; both are computed and blended per lane */
  quadrant = _mm256_and_si128(quadrant, *(v8si*)_pi32_256_2);
  quadrant = _mm256_cmpeq_epi32(quadrant, *(v8si*)_pi32_256_0);
#else
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), q_lo, q_hi);

  q_lo = _mm_add_epi32(q_lo, *(v4si*)_pi32avx_1);
  q_hi = _mm_add_epi32(q_hi, *(v4si*)_pi32avx_1);
  q_lo = _mm_and_si128(q_lo, *(v4si*)_pi32avx_inv1);
  q_hi = _mm_and_si128(q_hi, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(q_lo, q_hi, quadrant);
  y = _mm256_cvtepi32_ps(quadrant);

  sw_lo = _mm_and_si128(q_lo, *(v4si*)_pi32avx_4);
  sw_hi = _mm_and_si128(q_hi, *(v4si*)_pi32avx_4);
  sw_lo = _mm_slli_epi32(sw_lo, 29);
  sw_hi = _mm_slli_epi32(sw_hi, 29);
  COPY_XMM_TO_IMM(sw_lo, sw_hi, swap_bits);

  q_lo = _mm_and_si128(q_lo, *(v4si*)_pi32avx_2);
  q_hi = _mm_and_si128(q_hi, *(v4si*)_pi32avx_2);
  q_lo = _mm_cmpeq_epi32(q_lo, _mm_setzero_si128());
  q_hi = _mm_cmpeq_epi32(q_hi, _mm_setzero_si128());
  COPY_XMM_TO_IMM(q_lo, q_hi, quadrant);
#endif

  v8sf swap_sign_bit = _mm256_castsi256_ps(swap_bits);
  v8sf poly_mask = _mm256_castsi256_ps(quadrant);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: extended-precision modular arithmetic,
     x = ((x - y * DP1) - y * DP2) - y * DP3 */
  reduce1 = _mm256_mul_ps(y, *(v8sf*)_ps256_minus_cephes_DP1);
  reduce2 = _mm256_mul_ps(y, *(v8sf*)_ps256_minus_cephes_DP2);
  reduce3 = _mm256_mul_ps(y, *(v8sf*)_ps256_minus_cephes_DP3);
  x = _mm256_add_ps(x, reduce1);
  x = _mm256_add_ps(x, reduce2);
  x = _mm256_add_ps(x, reduce3);

  /* first polynomial: cosine branch, 0 <= x <= Pi/4 */
  v8sf z = _mm256_mul_ps(x, x);
  y = *(v8sf*)_ps256_coscof_p0;
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  y = _mm256_sub_ps(y, _mm256_mul_ps(z, *(v8sf*)_ps256_0p5));
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* second polynomial: sine branch, Pi/4 <= x <= 0 */
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* blend the two branches per lane, then restore the sign */
  v8sf sine_lanes = _mm256_and_ps(poly_mask, y2);
  v8sf cos_lanes = _mm256_andnot_ps(poly_mask, y);
  y = _mm256_add_ps(cos_lanes, sine_lanes);
  return _mm256_xor_ps(y, sign_bit);
}