/* AVX implementation for Minimum Mean Squared Error (MMSE) solver */
inline void srslte_mat_2x2_mmse_avx(__m256 y0, __m256 y1,
                                    __m256 h00, __m256 h01, __m256 h10, __m256 h11,
                                    __m256 *x0, __m256 *x1,
                                    float noise_estimate, float norm) {
  __m256 _noise_estimate = _mm256_set_ps(0.0f, noise_estimate, 0.0f, noise_estimate,
                                         0.0f, noise_estimate, 0.0f, noise_estimate);
  __m256 _norm = _mm256_set1_ps(norm);

  /* Create conjugated matrix */
  __m256 _h00 = _MM256_CONJ_PS(h00);
  __m256 _h01 = _MM256_CONJ_PS(h01);
  __m256 _h10 = _MM256_CONJ_PS(h10);
  __m256 _h11 = _MM256_CONJ_PS(h11);

  /* 1. A = H' x H + No */
#ifdef LV_HAVE_FMA
  __m256 a00 = _MM256_SQMOD_ADD_PS(h00, h10, _noise_estimate);
  __m256 a01 = _MM256_PROD_ADD_PS(_h00, h01, _MM256_PROD_PS(_h10, h11));
  __m256 a10 = _MM256_PROD_ADD_PS(_h01, h00, _MM256_PROD_PS(_h11, h10));
  __m256 a11 = _MM256_SQMOD_ADD_PS(h01, h11, _noise_estimate);
#else
  __m256 a00 = _mm256_add_ps(_MM256_SQMOD_PS(h00, h10), _noise_estimate);
  __m256 a01 = _mm256_add_ps(_MM256_PROD_PS(_h00, h01), _MM256_PROD_PS(_h10, h11));
  __m256 a10 = _mm256_add_ps(_MM256_PROD_PS(_h01, h00), _MM256_PROD_PS(_h11, h10));
  __m256 a11 = _mm256_add_ps(_MM256_SQMOD_PS(h01, h11), _noise_estimate);
#endif /* LV_HAVE_FMA */

  /* 2. B = inv(H' x H + No) = inv(A) */
  __m256 b00 = a11;
  __m256 b01 = _mm256_xor_ps(a01, _mm256_set1_ps(-0.0f));
  __m256 b10 = _mm256_xor_ps(a10, _mm256_set1_ps(-0.0f));
  __m256 b11 = a00;
  _norm = _mm256_mul_ps(_norm, srslte_mat_cf_recip_avx(srslte_mat_2x2_det_avx(a00, a01, a10, a11)));

  /* 3. W = inv(H' x H + No) x H' = B x H' */
#ifdef LV_HAVE_FMA
  __m256 w00 = _MM256_PROD_ADD_PS(b00, _h00, _MM256_PROD_PS(b01, _h01));
  __m256 w01 = _MM256_PROD_ADD_PS(b00, _h10, _MM256_PROD_PS(b01, _h11));
  __m256 w10 = _MM256_PROD_ADD_PS(b10, _h00, _MM256_PROD_PS(b11, _h01));
  __m256 w11 = _MM256_PROD_ADD_PS(b10, _h10, _MM256_PROD_PS(b11, _h11));
#else
  __m256 w00 = _mm256_add_ps(_MM256_PROD_PS(b00, _h00), _MM256_PROD_PS(b01, _h01));
  __m256 w01 = _mm256_add_ps(_MM256_PROD_PS(b00, _h10), _MM256_PROD_PS(b01, _h11));
  __m256 w10 = _mm256_add_ps(_MM256_PROD_PS(b10, _h00), _MM256_PROD_PS(b11, _h01));
  __m256 w11 = _mm256_add_ps(_MM256_PROD_PS(b10, _h10), _MM256_PROD_PS(b11, _h11));
#endif /* LV_HAVE_FMA */

  /* 4. X = W x Y */
#ifdef LV_HAVE_FMA
  *x0 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w00, _MM256_PROD_PS(y1, w01)), _norm);
  *x1 = _MM256_PROD_PS(_MM256_PROD_ADD_PS(y0, w10, _MM256_PROD_PS(y1, w11)), _norm);
#else
  *x0 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w00), _MM256_PROD_PS(y1, w01)), _norm);
  *x1 = _MM256_PROD_PS(_mm256_add_ps(_MM256_PROD_PS(y0, w10), _MM256_PROD_PS(y1, w11)), _norm);
#endif /* LV_HAVE_FMA */
}
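/* A scalar reference for the equalizer above, to make the math explicit:
   x = norm * inv(H'H + N0*I) * H' * y, where B is the adjugate of A and the
   determinant is folded into the norm exactly as the AVX code folds it into
   _norm. A sketch using std::complex; the function name mat_2x2_mmse_ref is
   illustrative, not part of the srsLTE API. */
#include <complex>
static inline void mat_2x2_mmse_ref(std::complex<float> y0, std::complex<float> y1,
                                    std::complex<float> h00, std::complex<float> h01,
                                    std::complex<float> h10, std::complex<float> h11,
                                    std::complex<float> *x0, std::complex<float> *x1,
                                    float noise_estimate, float norm) {
  using cf = std::complex<float>;
  /* A = H' * H + N0 * I (2x2, Hermitian) */
  cf a00 = std::conj(h00) * h00 + std::conj(h10) * h10 + noise_estimate;
  cf a01 = std::conj(h00) * h01 + std::conj(h10) * h11;
  cf a10 = std::conj(h01) * h00 + std::conj(h11) * h10;
  cf a11 = std::conj(h01) * h01 + std::conj(h11) * h11 + noise_estimate;
  /* inv(A) = adj(A) / det(A); fold det(A) and norm into one factor */
  cf k = norm / (a00 * a11 - a01 * a10);
  /* W = inv(A) * H' */
  cf w00 = k * ( a11 * std::conj(h00) - a01 * std::conj(h01));
  cf w01 = k * ( a11 * std::conj(h10) - a01 * std::conj(h11));
  cf w10 = k * (-a10 * std::conj(h00) + a00 * std::conj(h01));
  cf w11 = k * (-a10 * std::conj(h10) + a00 * std::conj(h11));
  /* x = W * y */
  *x0 = w00 * y0 + w01 * y1;
  *x1 = w10 * y0 + w11 * y1;
}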
inline avx_m256_t newsin_ps(avx_m256_t x) {
  /* extract the sign bit and take the absolute value */
  avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
  x = _mm256_and_ps(x, _ps_inv_sign_mask);

  /* scale by 4/Pi and compute the integer part: j = (j+1) & (~1) */
  avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);
  avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
  emm2 = _mm256_add_epi32(emm2, _pi32_1);
  emm2 = _mm256_and_si256(emm2, _pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  /* swap-sign flag and polynomial selection mask */
  avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  emm2 = _mm256_and_si256(emm2, _pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
  avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* range reduction: x = x + y * (-DP1 - DP2 - DP3) */
  avx_m256_t temp = _ps_minus_cephes_DP123;
  temp = _mm256_mul_ps(y, temp);
  x = _mm256_add_ps(x, temp);

  avx_m256_t x2 = _mm256_mul_ps(x, x);
  avx_m256_t x3 = _mm256_mul_ps(x2, x);
  avx_m256_t x4 = _mm256_mul_ps(x2, x2);

  /* evaluate the cosine (y) and sine (y2) polynomials in parallel */
  y = _ps_coscof_p0;
  avx_m256_t y2 = _ps_sincof_p0;
  y = _mm256_mul_ps(y, x2);
  y2 = _mm256_mul_ps(y2, x2);
  y = _mm256_add_ps(y, _ps_coscof_p1);
  y2 = _mm256_add_ps(y2, _ps_sincof_p1);
  y = _mm256_mul_ps(y, x2);
  y2 = _mm256_mul_ps(y2, x2);
  y = _mm256_add_ps(y, _ps_coscof_p2);
  y2 = _mm256_add_ps(y2, _ps_sincof_p2);
  y = _mm256_mul_ps(y, x4);
  y2 = _mm256_mul_ps(y2, x3);
  temp = _mm256_mul_ps(x2, _ps_0p5);
  temp = _mm256_sub_ps(temp, _ps_1);
  y = _mm256_sub_ps(y, temp);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct polynomial per lane and restore the sign */
  y = _mm256_andnot_ps(poly_mask, y);
  y2 = _mm256_and_ps(poly_mask, y2);
  y = _mm256_add_ps(y, y2);
  y = _mm256_xor_ps(y, sign_bit);

  return y;
} // newsin_ps()
/*---------------------------------------------------------------------------*/
__m256 TTriangle::THit::HitTest8(__m256 mask, const TPoint8& orig, const D3DXVECTOR3& d, HitResult8* result) const {
  /* pick the projection axes: w is the dominant axis, u/v are the other two */
  int u, v, w;
  w = ci;
  u = w == 0 ? 1 : 0;
  v = w == 2 ? 1 : 2;

  __m256 nu = _mm256_broadcast_ss(&this->nu);
  __m256 np = _mm256_broadcast_ss(&this->np);
  __m256 nv = _mm256_broadcast_ss(&this->nv);
  __m256 pu = _mm256_broadcast_ss(&this->pu);
  __m256 pv = _mm256_broadcast_ss(&this->pv);
  __m256 e0u = _mm256_broadcast_ss(&this->e0u);
  __m256 e0v = _mm256_broadcast_ss(&this->e0v);
  __m256 e1u = _mm256_broadcast_ss(&this->e1u);
  __m256 e1v = _mm256_broadcast_ss(&this->e1v);

  __m256 ou = orig[u];
  __m256 ov = orig[v];
  __m256 ow = orig[w];
  __m256 du = _mm256_broadcast_ss(&d[u]);
  __m256 dv = _mm256_broadcast_ss(&d[v]);
  __m256 dw = _mm256_broadcast_ss(&d[w]);

  /* scaled hit distance (dett) and determinant (det) of the projected
     ray/plane intersection */
  __m256 dett = np - (ou*nu + ov*nv + ow);
  __m256 det = du*nu + dv*nv + dw;
  __m256 Du = du*dett - (pu - ou)*det;
  __m256 Dv = dv*dett - (pv - ov)*det;
  __m256 detu = (e1v*Du - e1u*Dv);
  __m256 detv = (e0u*Dv - e0v*Du);

  /* inside test: det, detu and detv must all share the same sign */
  __m256 tmpdet0 = det - detu - detv;
  __m256 detMask = _mm256_xor_ps(_mm256_xor_ps(tmpdet0, detv) | _mm256_xor_ps(detv, detu), g_one8) > _mm256_setzero_ps();
  mask = mask & detMask;

  /* one reciprocal serves all three ratios */
  __m256 rdet = _mm256_rcp_ps(det);
  result->t = dett * rdet;
  result->u = detu * rdet;
  result->v = detv * rdet;
  return mask & (result->t > _mm256_setzero_ps());
}
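/* Scalar sketch of the sign-agreement trick behind detMask above: the XOR of
   two floats has its sign bit set iff their signs differ, so OR-ing the
   pairwise XORs and testing the sign bit checks that det, detu and detv all
   share one sign (the barycentric inside test). The float_bits/same_sign3
   helpers are illustrative, not from the original source. */
#include <stdint.h>
#include <string.h>
static inline uint32_t float_bits(float f) {
  uint32_t u;
  memcpy(&u, &f, sizeof u); /* well-defined bit view of the float */
  return u;
}
static inline int same_sign3(float a, float b, float c) {
  uint32_t d = (float_bits(a) ^ float_bits(b)) | (float_bits(b) ^ float_bits(c));
  return (d & 0x80000000u) == 0; /* sign bit clear => all three signs agree */
}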
/* V_SgMinusOp */
__SIMD _SIMD_neg_ps(__SIMD a) {
#ifdef USE_SSE
  return _mm_xor_ps(a, _mm_set1_ps(-0.0f));
#elif defined USE_AVX
  /* the 256-bit variant must use _mm256_set1_ps, not the 128-bit _mm_set1_ps */
  return _mm256_xor_ps(a, _mm256_set1_ps(-0.0f));
#elif defined USE_IBM
  return vec_neg(a);
#endif
}
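/* Why XOR with -0.0f negates: IEEE-754 floats keep their sign in the top bit,
   and -0.0f is exactly that bit pattern. A quick self-contained check (a
   standalone sketch, not part of the library above): */
#include <assert.h>
#include <immintrin.h>
static inline void neg_ps_demo(void) {
  __m256 a = _mm256_set1_ps(3.5f);
  __m256 n = _mm256_xor_ps(a, _mm256_set1_ps(-0.0f)); /* flip only the sign bits */
  float out[8];
  _mm256_storeu_ps(out, n);
  assert(out[0] == -3.5f);
}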
void plot(u32 w, u32 h, float x1, float y1, float x2, float y2, float dx, float dy, u32 max_iter = 4096) {
  assert(w % 8 == 0);

  // AVX Constants
  float const constants[] { x1, y1, dx, dy, 1.0f, 4.0f };
  __m256 const vx1 = _mm256_broadcast_ss(constants);
  __m256 const vy1 = _mm256_broadcast_ss(constants + 1);
  __m256 const vdx = _mm256_broadcast_ss(constants + 2);
  __m256 const vdy = _mm256_broadcast_ss(constants + 3);
  __m256 const v1 = _mm256_broadcast_ss(constants + 4);
  __m256 const v4 = _mm256_broadcast_ss(constants + 5);

  // Start timing
  std::chrono::time_point<std::chrono::high_resolution_clock> t1, t2;
  std::chrono::duration<double> dt;
  t1 = std::chrono::high_resolution_clock::now();

  // Zero line counter
  __m256 vj = _mm256_xor_ps(v1, v1);

  for (u32 j = 0; j < h; j++) {
    for (u32 i = 0; i < w; i += 8) {
      // Fill column counter
      float const vi_[8] { i+0.f, i+1.f, i+2.f, i+3.f, i+4.f, i+5.f, i+6.f, i+7.f };
      __m256 vi = _mm256_load_ps(vi_);

      // Compute start point
      __m256 vx0 = _mm256_mul_ps(vi, vdx);
      vx0 = _mm256_add_ps(vx0, vx1);
      __m256 vy0 = _mm256_mul_ps(vj, vdy);
      vy0 = _mm256_add_ps(vy0, vy1);

      __m256 vx = vx0;
      __m256 vy = vy0;
      __m256 vcount = _mm256_xor_ps(v1, v1); // Zero iteration counter
      u32 iter = 0;
      u8 no_overflow = 0;
      do {
        // Compute products
        __m256 vxx = _mm256_mul_ps(vx, vx);
        __m256 vyy = _mm256_mul_ps(vy, vy);

        // Check termination condition: |z|^2 < 4
        __m256 vtmp = _mm256_add_ps(vxx, vyy);
        vtmp = _mm256_cmp_ps(vtmp, v4, _CMP_LT_OQ);
        no_overflow = _mm256_movemask_ps(vtmp) & 0xff;

        // Accumulate iteration counter for the lanes still iterating
        vtmp = _mm256_and_ps(vtmp, v1);
        vcount = _mm256_add_ps(vcount, vtmp);

        // Step: z = z^2 + c
        vtmp = _mm256_mul_ps(vx, vy);
        vtmp = _mm256_add_ps(vtmp, vtmp);
        vy = _mm256_add_ps(vtmp, vy0);
        vtmp = _mm256_sub_ps(vxx, vyy);
        vx = _mm256_add_ps(vtmp, vx0);

        ++iter;
      } while (no_overflow && (iter < max_iter));

      for (u32 k = 0; k < 8; k++) {
        u32 n = ((float *) &vcount)[k] + 0.5f;
        if (n == max_iter) n = 0;
        char c = ' ';
        if (n > 0) {
          static char const charset[] = ".,c8M@jawrpogOQEPGJ";
          c = charset[n % (sizeof(charset) - 1)];
        }
        attron(COLOR_PAIR((n % 7) + 1));
        addch(c);
        attroff(COLOR_PAIR((n % 7) + 1));
        if (i + k + 1 == w) addch('\n');
      }
    }

    // Increment line counter
    vj = _mm256_add_ps(vj, v1);
  }

  // End timing
  t2 = std::chrono::high_resolution_clock::now();
  dt = t2 - t1;
  std::string info = std::to_string(dt.count() * 1000.0) + "ms";
  attron(COLOR_PAIR(1));
  printw("%s", info.c_str()); // pass as an argument, never as the format string
  attroff(COLOR_PAIR(1));
}
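// A minimal, hypothetical driver for plot() above (assumes ncurses is
// initialized and COLOR_PAIRs 1..7 are set up elsewhere); dx/dy are derived
// from the view rectangle so that w*dx spans x2-x1 and h*dy spans y2-y1:
void render_view(u32 w, u32 h) {
  float x1 = -2.5f, y1 = -1.25f, x2 = 1.0f, y2 = 1.25f;
  float dx = (x2 - x1) / w;
  float dy = (y2 - y1) / h;
  plot(w, h, x1, y1, x2, y2, dx, dy);
}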
inline vec8 operator!(vec8 a) {
  // all-ones mask built with set1_epi32(-1) instead of type-punning an
  // unsigned through a float pointer (which violates strict aliasing)
  return _mm256_xor_ps(_mm256_castsi256_ps(_mm256_set1_epi32(-1)), a);
}
inline vec8 operator^(vec8 a, vec8 b) { return _mm256_xor_ps(a, b); }
/*!
 * \brief Compute the negative of each element in the given vector
 * \return a vector containing the negative of each input element
 */
ETL_STATIC_INLINE(avx_simd_float) minus(avx_simd_float x) {
    return _mm256_xor_ps(x.value, _mm256_set1_ps(-0.f));
}
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could
   replace both of them.. it is almost as fast, and gives you a free cosine
   with your sine */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {
  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynomial selection mask for the sine */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);
  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x, x);
  y = *(v8sf*)_ps256_coscof_p0;
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2, ysin2);
  y = _mm256_sub_ps(y, ysin1);
  xmm1 = _mm256_add_ps(ysin1, ysin2);
  xmm2 = _mm256_add_ps(y, y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
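/* A quick accuracy check of sincos256_ps against the scalar libm results
   (a standalone sketch; assumes the function and v8sf type above are in
   scope): */
#include <math.h>
#include <stdio.h>
void sincos256_selftest(void) {
  float in[8], sv[8], cv[8];
  for (int i = 0; i < 8; i++) in[i] = (float)i * 0.7f - 2.0f;
  v8sf s, c;
  sincos256_ps(_mm256_loadu_ps(in), &s, &c);
  _mm256_storeu_ps(sv, s);
  _mm256_storeu_ps(cv, c);
  for (int i = 0; i < 8; i++)
    printf("x=% .3f  dsin=%g  dcos=%g\n", in[i],
           sv[i] - sinf(in[i]), cv[i] - cosf(in[i]));
}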
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
  avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
  x = _mm256_and_ps(x, _ps_inv_sign_mask);

  avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

  /* AVX1 lacks 256-bit integer ops, so the integer arithmetic of the
     original algorithm (kept in the comments) is emulated with float ops
     and per-128-bit-lane shifts */
  //avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
  //emm2 = _mm256_add_epi32(emm2, _pi32_1);
  avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

  //emm2 = _mm256_and_si256(emm2, _pi32_inv1);
  emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

  y = _mm256_cvtepi32_ps(emm2);

  //avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
  avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

  //avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
  avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_4)));

  //avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
  avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_4)));

  //emm0 = _mm256_slli_epi32(emm0, 29);
  /* shift each 128-bit lane separately (lane 0 is the low half) */
  __m128i emm0lo = _mm256_extractf128_si256(emm0, 0);
  __m128i emm0hi = _mm256_extractf128_si256(emm0, 1);
  emm0lo = _mm_slli_epi32(emm0lo, 29);
  emm0hi = _mm_slli_epi32(emm0hi, 29);
  emm0 = _mm256_insertf128_si256(emm0, emm0lo, 0);
  emm0 = _mm256_insertf128_si256(emm0, emm0hi, 1);

  //cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
  __m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 0);
  __m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 1);
  cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
  cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
  cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 0);
  cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 1);

  //emm2 = _mm256_and_si256(emm2, _pi32_2);
  emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_2)));

  //cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
  cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_2)));

  //emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

  //cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
  cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

  avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
  avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
  avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
  avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

  sign_bit = _mm256_xor_ps(sign_bit, emm0f);

  avx_m256_t temp_2 = _ps_minus_cephes_DP123;
  temp_2 = _mm256_mul_ps(y, temp_2);
  x = _mm256_add_ps(x, temp_2);

  avx_m256_t x2 = _mm256_mul_ps(x, x);
  avx_m256_t x3 = _mm256_mul_ps(x2, x);
  avx_m256_t x4 = _mm256_mul_ps(x2, x2);

  y = _ps_coscof_p0;
  avx_m256_t y2 = _ps_sincof_p0;

  y = _mm256_mul_ps(y, x2);
  y2 = _mm256_mul_ps(y2, x2);
  y = _mm256_add_ps(y, _ps_coscof_p1);
  y2 = _mm256_add_ps(y2, _ps_sincof_p1);
  y = _mm256_mul_ps(y, x2);
  y2 = _mm256_mul_ps(y2, x2);
  y = _mm256_add_ps(y, _ps_coscof_p2);
  y2 = _mm256_add_ps(y2, _ps_sincof_p2);
  y = _mm256_mul_ps(y, x4);
  y2 = _mm256_mul_ps(y2, x3);

  temp_2 = _mm256_mul_ps(x2, _ps_0p5);
  y2 = _mm256_add_ps(y2, x);
  temp_2 = _mm256_sub_ps(temp_2, _ps_1);
  y = _mm256_sub_ps(y, temp_2);

  avx_m256_t cos_y = y;
  avx_m256_t cos_y2 = y2;
  y = _mm256_andnot_ps(emm2f, y);
  cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
  y2 = _mm256_and_ps(emm2f, y2);
  cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);

  y = _mm256_add_ps(y, y2);
  cos_y = _mm256_add_ps(cos_y, cos_y2);

  *s = _mm256_xor_ps(y, sign_bit);
  *c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
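/* The extract/shift/insert dance above is the standard AVX1 workaround for
   the missing 256-bit integer shift; a sketch of it factored into a helper
   (the name slli_epi32_avx1 is illustrative): */
static inline __m256i slli_epi32_avx1(__m256i v, int count) {
  __m128i lo = _mm256_extractf128_si256(v, 0); /* low 128 bits */
  __m128i hi = _mm256_extractf128_si256(v, 1); /* high 128 bits */
  lo = _mm_slli_epi32(lo, count);
  hi = _mm_slli_epi32(hi, count);
  v = _mm256_insertf128_si256(v, lo, 0);
  return _mm256_insertf128_si256(v, hi, 1);
}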
INLINE const avxi operator ^( const avxi& a, const avxi& b ) { return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;

  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);
  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynomial selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);

  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  /* note: the original called _mm256_zeroupper() right before returning;
     vzeroupper belongs at AVX-to-SSE transition points, and issuing it while
     the 256-bit return value is still live is at best useless and with some
     compilers clobbers the result's upper lanes, so it has been removed */
  return y;
}
void animate() {
  float mx;
  float my;
  if(ManualControl) {
    POINT pos;
    GetCursorPos(&pos);
    RECT rc;
    GetClientRect(hMainWnd, &rc);
    ScreenToClient(hMainWnd, &pos);
    mx = pos.x;
    my = pos.y;
  } else {
    UpdatePosition(mx, my);
  }

  const auto size = partCount;
  VertexData *pVertexBuffer;
  pVertexObject->Lock(0, 0, (void**)&pVertexBuffer, D3DLOCK_DISCARD);
  _mm256_zeroall();

#pragma omp parallel \
  shared(pVertexBuffer, particlesCoord, particlesVel, mx, my, size)
  {
#pragma omp for nowait
    for(int i = 0; i < size; i += 4) {
      // four particles per iteration; each particle stores two floats (x, y),
      // so the float offset for particle i is 2 * i (the original advanced
      // the pointer by i, which walks overlapping pairs)
      float mouseCoordVec[8] = { mx, my, mx, my, mx, my, mx, my };
      float *particleCoordsVec = (float*)particlesCoord + 2 * i;
      float *velocityVec = (float*)particlesVel + 2 * i;

      auto xyCoord = _mm256_loadu_ps(particleCoordsVec);
      // per-component offsets from the mouse position
      auto hwTempData = _mm256_sub_ps(xyCoord, _mm256_loadu_ps(mouseCoordVec));
      auto squares = _mm256_mul_ps(hwTempData, hwTempData);
      // dx^2 + dy^2 per particle, duplicated into both of its slots
      auto distSquare = _mm256_hadd_ps(squares, squares);
      distSquare = _mm256_shuffle_ps(distSquare, distSquare, 0x50);

      auto theForce = _mm256_div_ps(_mm256_set1_ps(G), distSquare);
      // suppress the force for particles too close to the cursor
      if(distSquare.m256_f32[0] < 400) { theForce.m256_f32[0] = 0; theForce.m256_f32[1] = 0; }
      if(distSquare.m256_f32[2] < 400) { theForce.m256_f32[2] = 0; theForce.m256_f32[3] = 0; }
      if(distSquare.m256_f32[4] < 400) { theForce.m256_f32[4] = 0; theForce.m256_f32[5] = 0; }
      if(distSquare.m256_f32[6] < 400) { theForce.m256_f32[6] = 0; theForce.m256_f32[7] = 0; }

      // flip the offset's sign (attract toward the cursor) and scale by the force
      auto xyForces = _mm256_mul_ps(_mm256_xor_ps(hwTempData, _mm256_set1_ps(-0.f)), theForce);

      auto xyVelocities = _mm256_loadu_ps(velocityVec);
      xyVelocities = _mm256_mul_ps(xyVelocities, _mm256_set1_ps(Resistance));
      xyVelocities = _mm256_add_ps(xyVelocities, xyForces);
      xyCoord = _mm256_add_ps(xyCoord, xyVelocities);

      _mm256_storeu_ps(velocityVec, xyVelocities);
      _mm256_storeu_ps(particleCoordsVec, xyCoord);

      processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[0], ((ParticleVel*)velocityVec)[0]);
      processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[1], ((ParticleVel*)velocityVec)[1]);
      processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[2], ((ParticleVel*)velocityVec)[2]);
      processIfOutOfBounds(((ParticleCoord*)particleCoordsVec)[3], ((ParticleVel*)velocityVec)[3]);

      pVertexBuffer[i].x     = ((ParticleCoord*)particleCoordsVec)[0].x;
      pVertexBuffer[i].y     = ((ParticleCoord*)particleCoordsVec)[0].y;
      pVertexBuffer[i + 1].x = ((ParticleCoord*)particleCoordsVec)[1].x;
      pVertexBuffer[i + 1].y = ((ParticleCoord*)particleCoordsVec)[1].y;
      pVertexBuffer[i + 2].x = ((ParticleCoord*)particleCoordsVec)[2].x;
      pVertexBuffer[i + 2].y = ((ParticleCoord*)particleCoordsVec)[2].y;
      pVertexBuffer[i + 3].x = ((ParticleCoord*)particleCoordsVec)[3].x;
      pVertexBuffer[i + 3].y = ((ParticleCoord*)particleCoordsVec)[3].y;
    }
  }

  pVertexObject->Unlock();
  _mm256_zeroall();
}
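/* A quick check of the hadd+shuffle pattern used above: for interleaved
   pairs (dx_i, dy_i) it leaves dx_i^2 + dy_i^2 duplicated in both slots of
   pair i (a standalone sketch): */
#include <stdio.h>
#include <immintrin.h>
void dist_layout_demo(void) {
  __m256 v = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8); /* (dx0,dy0)...(dx3,dy3) */
  __m256 sq = _mm256_mul_ps(v, v);
  __m256 d = _mm256_hadd_ps(sq, sq);           /* pairwise sums per 128-bit lane */
  d = _mm256_shuffle_ps(d, d, 0x50);           /* duplicate each sum into its pair */
  float out[8];
  _mm256_storeu_ps(out, d);
  for (int i = 0; i < 8; i++) printf("%g ", out[i]); /* 5 5 25 25 61 61 113 113 */
  printf("\n");
}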
/*
 * Bitwise NOT operation for integers
 */
inline __m256i not_si256(const __m256i x) {
    static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
    return CAST_REAL_TO_INT_V(_mm256_xor_ps(CAST_INT_TO_REAL_V(mask), CAST_INT_TO_REAL_V(x)));
}
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
                              avx_m256_t *c1, avx_m256_t *c2) {
  avx_m256_t tempa = _ps_sign_mask;
  avx_m256_t tempb = _ps_inv_sign_mask;
  avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
  avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
  x1 = _mm256_and_ps(x1, tempb);
  x2 = _mm256_and_ps(x2, tempb);

  tempa = _ps_cephes_FOPI;
  avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
  avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

  //avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
  //avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
  //emm21 = _mm256_add_epi32(emm21, _pi32_1);
  //emm22 = _mm256_add_epi32(emm22, _pi32_1);
  avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
  avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

  //emm21 = _mm256_and_si256(emm21, _pi32_inv1);
  //emm22 = _mm256_and_si256(emm22, _pi32_inv1);
  emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
  emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

  y1 = _mm256_cvtepi32_ps(emm21);
  y2 = _mm256_cvtepi32_ps(emm22);

  //avx_m256i_t tempia = _pi32_2;
  //avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
  //avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
  avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
  avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

  //avx_m256i_t tempib = _pi32_4;
  //avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
  //avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
  avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_4)));
  avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_4)));

  //avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
  //avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
  avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_4)));
  avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_4)));

  /* AVX1 workaround for the 256-bit integer shifts: shift each 128-bit lane
     separately (lane 0 is the low half) */
  //emm01 = _mm256_slli_epi32(emm01, 29);
  __m128i emm0lo1 = _mm256_extractf128_si256(emm01, 0);
  __m128i emm0hi1 = _mm256_extractf128_si256(emm01, 1);
  emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
  emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
  emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 0);
  emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 1);

  //emm02 = _mm256_slli_epi32(emm02, 29);
  __m128i emm0lo2 = _mm256_extractf128_si256(emm02, 0);
  __m128i emm0hi2 = _mm256_extractf128_si256(emm02, 1);
  emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
  emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
  /* fixed: the original re-inserted the halves of emm01 here by mistake */
  emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 0);
  emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 1);

  //cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
  __m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 0);
  __m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 1);
  cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
  cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
  cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 0);
  cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 1);

  //cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
  __m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 0);
  __m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 1);
  cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
  cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
  cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 0);
  cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 1);

  //tempia = _pi32_2;
  //tempib = _mm256_setzero_si256();
  //emm21 = _mm256_and_si256(emm21, tempia);
  //emm22 = _mm256_and_si256(emm22, tempia);
  emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_2)));
  emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_2)));

  //cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
  //cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
  cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_2)));
  cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_2)));

  //emm21 = _mm256_cmpeq_epi32(emm21, tempib);
  //emm22 = _mm256_cmpeq_epi32(emm22, tempib);
  emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
  emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

  //cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
  //cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
  cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
  cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

  avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
  avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
  avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
  avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
  avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
  avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
  avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
  avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

  sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
  sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

  tempa = _ps_minus_cephes_DP123;
  tempb = _mm256_mul_ps(y2, tempa);
  tempa = _mm256_mul_ps(y1, tempa);
  x2 = _mm256_add_ps(x2, tempb);
  x1 = _mm256_add_ps(x1, tempa);

  avx_m256_t x21 = _mm256_mul_ps(x1, x1);
  avx_m256_t x22 = _mm256_mul_ps(x2, x2);
  avx_m256_t x31 = _mm256_mul_ps(x21, x1);
  avx_m256_t x32 = _mm256_mul_ps(x22, x2);
  avx_m256_t x41 = _mm256_mul_ps(x21, x21);
  avx_m256_t x42 = _mm256_mul_ps(x22, x22);

  tempa = _ps_coscof_p0;
  tempb = _ps_sincof_p0;
  y1 = _mm256_mul_ps(x21, tempa);
  y2 = _mm256_mul_ps(x22, tempa);
  avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
  avx_m256_t y22 = _mm256_mul_ps(x22, tempb);

  tempa = _ps_coscof_p1;
  tempb = _ps_sincof_p1;
  y1 = _mm256_add_ps(y1, tempa);
  y2 = _mm256_add_ps(y2, tempa);
  y21 = _mm256_add_ps(y21, tempb);
  y22 = _mm256_add_ps(y22, tempb);

  y1 = _mm256_mul_ps(y1, x21);
  y2 = _mm256_mul_ps(y2, x22);
  y21 = _mm256_mul_ps(y21, x21);
  y22 = _mm256_mul_ps(y22, x22);

  tempa = _ps_coscof_p2;
  tempb = _ps_sincof_p2;
  y1 = _mm256_add_ps(y1, tempa);
  y2 = _mm256_add_ps(y2, tempa);
  y21 = _mm256_add_ps(y21, tempb);
  y22 = _mm256_add_ps(y22, tempb);

  y1 = _mm256_mul_ps(y1, x41);
  y2 = _mm256_mul_ps(y2, x42);
  y21 = _mm256_mul_ps(y21, x31);
  y22 = _mm256_mul_ps(y22, x32);

  tempa = _ps_0p5;
  tempb = _ps_1;
  avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
  avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
  y21 = _mm256_add_ps(y21, x1);
  y22 = _mm256_add_ps(y22, x2);
  temp_21 = _mm256_sub_ps(temp_21, tempb);
  temp_22 = _mm256_sub_ps(temp_22, tempb);
  y1 = _mm256_sub_ps(y1, temp_21);
  y2 = _mm256_sub_ps(y2, temp_22);

  avx_m256_t cos_y1 = y1;
  avx_m256_t cos_y2 = y2;
  avx_m256_t cos_y21 = y21;
  avx_m256_t cos_y22 = y22;
  y1 = _mm256_andnot_ps(emm2f1, y1);
  y2 = _mm256_andnot_ps(emm2f2, y2);
  cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
  cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
  y21 = _mm256_and_ps(emm2f1, y21);
  y22 = _mm256_and_ps(emm2f2, y22);
  cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
  cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);

  y1 = _mm256_add_ps(y1, y21);
  y2 = _mm256_add_ps(y2, y22);
  cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
  cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

  *s1 = _mm256_xor_ps(y1, sign_bit1);
  *s2 = _mm256_xor_ps(y2, sign_bit2);
  *c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
  *c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
/*
 * Bitwise NOT operation for reals
 */
inline __m256 not_ps(const __m256 x) {
    static const __m256i mask = _mm256_set1_epi32(0xFFFFFFFF);
    return _mm256_xor_ps(CAST_INT_TO_REAL_V(mask), x);
}
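/* Typical use of the NOT-via-XOR helpers above: inverting a comparison mask.
   A sketch, assuming CAST_INT_TO_REAL_V wraps _mm256_castsi256_ps: */
void not_ps_demo(void) {
  __m256 a = _mm256_setr_ps(1, 2, 3, 4, 5, 6, 7, 8);
  __m256 lt4 = _mm256_cmp_ps(a, _mm256_set1_ps(4.0f), _CMP_LT_OQ);
  __m256 ge4 = not_ps(lt4); /* complement of the mask: lanes where !(a < 4) */
  (void)ge4;
}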
/* evaluation of 8 sines at once using AVX intrinsics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.
*/
v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /* Here we start a series of integer operations, which are in the realm of
     AVX2. If we don't have AVX2, let's perform them using SSE2 directives */

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynomial selection mask
     there is one polynomial for 0 <= x <= Pi/4
     and another one for Pi/4 < x <= Pi/2

     Both branches will be computed. */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y), imm2_1, imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif

  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynomial (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynomial (Pi/4 <= x <= Pi/2) */
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynomials */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);

  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
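/* Verifying the M_PI remark above (a standalone sketch): the float closest
   to Pi differs from Pi by roughly 8.74e-8, so the true sine of (float)M_PI
   has that magnitude rather than zero. */
#include <math.h>
#include <stdio.h>
void sin_pi_check(void) {
  v8sf r = sin256_ps(_mm256_set1_ps((float)M_PI));
  float out[8];
  _mm256_storeu_ps(out, r);
  printf("sin256_ps(pi_f) = %g (magnitude about 8.74e-8)\n", out[0]);
}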