Example #1
// SIMD abs
__SIMD _SIMD_abs_ps(__SIMD a)
{
#ifdef USE_SSE
  /* -0.0f carries only the sign bit (0x80000000); andnot clears it, leaving |a| */
  return _mm_andnot_ps(_mm_set1_ps(-0.0f), a);
#elif defined USE_AVX
  return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);
#elif defined USE_IBM
  return vec_abs(a);
#endif
}
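/* A minimal usage sketch (not part of the wrapper above), written directly against the
   AVX intrinsics used in the USE_AVX branch: -0.0f is just the sign bit, so andnot
   clears that bit in every lane and leaves the absolute value. */
#include <immintrin.h>
#include <stdio.h>

static void abs_demo(void)
{
  __m256 a = _mm256_set_ps(-4.f, 3.f, -2.f, 1.f, -0.5f, 0.f, -8.f, 7.f);
  __m256 r = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);  /* clear the sign bit */
  float out[8];
  _mm256_storeu_ps(out, r);
  for (int i = 0; i < 8; i++)
    printf("%f\n", out[i]);  /* absolute values of the eight inputs */
}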
Example #2
inline avx_m256_t newsin_ps(avx_m256_t x) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);   // remember the sign of x
	x = _mm256_and_ps(x, _ps_inv_sign_mask);                  // work with |x|

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);         // scale by 4/pi

	// j = (j + 1) & ~1: index of the octant (see the Cephes sources)
	avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	emm2 = _mm256_add_epi32(emm2, _pi32_1);
	emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	y = _mm256_cvtepi32_ps(emm2);

	avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);       // swap-sign flag
	emm0 = _mm256_slli_epi32(emm0, 29);

	emm2 = _mm256_and_si256(emm2, _pi32_2);                   // polynomial selection mask
	emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

	avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
	avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
	sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

	// argument reduction: x -= y * (DP1 + DP2 + DP3), with the three terms folded into one constant
	avx_m256_t temp = _ps_minus_cephes_DP123;
	temp = _mm256_mul_ps(y, temp);
	x = _mm256_add_ps(x, temp);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp = _mm256_mul_ps(x2, _ps_0p5);
	temp = _mm256_sub_ps(temp, _ps_1);
	y = _mm256_sub_ps(y, temp);
	y2 = _mm256_add_ps(y2, x);

	y = _mm256_andnot_ps(poly_mask, y);     // take the cosine polynomial where the mask is clear
	y2 = _mm256_and_ps(poly_mask, y2);      // and the sine polynomial where it is set
	y = _mm256_add_ps(y, y2);

	y = _mm256_xor_ps(y, sign_bit);

	return y;
} // newsin_ps()
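/* newsin_ps() (and the sincos variants further below) rely on constants declared elsewhere in
   their project. A hedged sketch of plausible definitions, assuming avx_m256_t/avx_m256i_t are
   just __m256/__m256i, C++ static initialisation, and values from the usual Cephes/sse_mathfun
   family this code appears to derive from: */
static const __m256  _ps_sign_mask     = _mm256_castsi256_ps(_mm256_set1_epi32((int)0x80000000));
static const __m256  _ps_inv_sign_mask = _mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff));
static const __m256  _ps_cephes_FOPI   = _mm256_set1_ps(1.27323954473516f);  /* 4/pi */
/* presumably the three Cephes reduction terms DP1+DP2+DP3 folded into one constant (~ -pi/4) */
static const __m256  _ps_minus_cephes_DP123 = _mm256_set1_ps(-0.7853981633974483f);
static const __m256  _ps_coscof_p0 = _mm256_set1_ps( 2.443315711809948e-5f);
static const __m256  _ps_coscof_p1 = _mm256_set1_ps(-1.388731625493765e-3f);
static const __m256  _ps_coscof_p2 = _mm256_set1_ps( 4.166664568298827e-2f);
static const __m256  _ps_sincof_p0 = _mm256_set1_ps(-1.9515295891e-4f);
static const __m256  _ps_sincof_p1 = _mm256_set1_ps( 8.3321608736e-3f);
static const __m256  _ps_sincof_p2 = _mm256_set1_ps(-1.6666654611e-1f);
static const __m256  _ps_0p5 = _mm256_set1_ps(0.5f);
static const __m256  _ps_1   = _mm256_set1_ps(1.0f);
static const __m256  _ps_2   = _mm256_set1_ps(2.0f);
static const __m256i _pi32_1    = _mm256_set1_epi32(1);
static const __m256i _pi32_inv1 = _mm256_set1_epi32(~1);
static const __m256i _pi32_2    = _mm256_set1_epi32(2);
static const __m256i _pi32_4    = _mm256_set1_epi32(4);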
Example #3
/*
  SIMD selection
*/
__SIMD _SIMD_sel_ps(__SIMD a, __SIMD b, void** resultPtr)
{
#ifdef  USE_SSE
  __SIMD* result = (__SIMD*) (*resultPtr);
  return _mm_or_ps(_mm_andnot_ps(*result,a),_mm_and_ps(*result,b));
#elif defined USE_AVX
  __SIMD* result = (__SIMD*) resultPtr;
  return _mm256_or_ps(_mm256_andnot_ps(*result,a),_mm256_and_ps(*result,b));
#elif defined USE_IBM
  /* assumption: the selection mask is reachable through resultPtr, as in the SSE branch */
  __SIMD* result = (__SIMD*) (*resultPtr);
  return vec_sel(a, b, *result);
#endif
}
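/* Usage sketch (raw AVX, independent of the wrapper above): the or(andnot(mask, a), and(mask, b))
   idiom returns b in the lanes where the mask is all-ones and a where it is all-zeros. Built
   here from a comparison mask, it yields a per-lane max(a, b). */
static __m256 select_demo(__m256 a, __m256 b)
{
  __m256 mask = _mm256_cmp_ps(a, b, _CMP_LT_OQ);  /* all-ones where a < b */
  return _mm256_or_ps(_mm256_andnot_ps(mask, a), _mm256_and_ps(mask, b));
}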
Example #4
// Rounding half away from zero (equivalent to round() from math.h)
// __m256 contains 8 floats, but to simplify the examples, only 4 will be shown
// Initial values to be used in the examples:
// [-12.49  -0.5   1.5   3.7]
static __m256 c63_mm256_roundhalfawayfromzero_ps(const __m256 initial)
{
	const __m256 sign_mask = _mm256_set1_ps(-0.f);
	const __m256 one_half = _mm256_set1_ps(0.5f);
	const __m256 all_zeros = _mm256_setzero_ps();
	const __m256 pos_one = _mm256_set1_ps(1.f);
	const __m256 neg_one = _mm256_set1_ps(-1.f);

	// Creates a mask based on the sign of the floats, true for negative floats
	// Example: [true   true   false   false]
	__m256 less_than_zero = _mm256_cmp_ps(initial, all_zeros, _CMP_LT_OQ);

	// Returns the integer part of the floats
	// Example: [-12.0   -0.0   1.0   3.0]
	__m256 without_fraction = _mm256_round_ps(initial, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));

	// Returns the fraction part of the floats
	// Example: [-0.49   -0.5   0.5   0.7]
	__m256 fraction = _mm256_sub_ps(initial, without_fraction);

	// Absolute values of the fractions
	// Example: [0.49   0.5   0.5   0.7]
	__m256 fraction_abs = _mm256_andnot_ps(sign_mask, fraction);

	// Compares abs(fractions) to 0.5, true if lower
	// Example: [true   false   false   false]
	__m256 less_than_one_half = _mm256_cmp_ps(fraction_abs, one_half, _CMP_LT_OQ);

	// Blends 1.0 and -1.0 depending on the initial sign of the floats
	// Example: [-1.0   -1.0   1.0   1.0]
	__m256 signed_ones = _mm256_blendv_ps(pos_one, neg_one, less_than_zero);

	// Blends the previous result with zeros depending on the fractions that are lower than 0.5
	// Example: [0.0   -1.0   1.0   1.0]
	__m256 to_add = _mm256_blendv_ps(signed_ones, all_zeros, less_than_one_half);

	// Adds the previous result to the floats without fractions
	// Example: [-12.0   -1.0   2.0   4.0]
	return _mm256_add_ps(without_fraction, to_add);
}
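/* Scalar reference for the vector routine above (rounding half away from zero, i.e. what
   roundf() from math.h does); handy for checking the eight lanes element by element. */
#include <math.h>

static float roundhalfawayfromzero_ref(float v)
{
	float without_fraction = truncf(v);      /* integer part, rounded toward zero */
	float fraction = v - without_fraction;   /* signed fractional part */
	if (fabsf(fraction) < 0.5f)
		return without_fraction;
	return without_fraction + (v < 0.0f ? -1.0f : 1.0f);
}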
Example #5
static void
avx_test (void)
{
  int i;
  union256 u, s1, s2;
  int source1[8]={34545, 95567, 23443, 5675, 2323, 67, 2345, 45667};
  int source2[8]={674, 57897, 93459, 45624, 54674, 1237, 67436, 79608};
  int d[8];
  int e[8];

  s1.x = _mm256_loadu_ps ((float *)source1);
  s2.x = _mm256_loadu_ps ((float *)source2);
  u.x = _mm256_andnot_ps (s1.x, s2.x);

  _mm256_storeu_ps ((float *)d, u.x);

  for (i = 0; i < 8; i++)
    e[i] = (~source1[i]) & source2[i];

  if (checkVi (d, e, 8))
    abort ();
}
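/* avx_test() reads like a compiler-testsuite routine; union256 and checkVi are not shown in
   this example. Guessed stand-ins with the behaviour the test relies on (checkVi returns
   non-zero on any mismatch, which makes the test abort()): */
typedef union { __m256 x; float a[8]; } union256;

static int checkVi (const int *got, const int *expected, int n)
{
  int i, err = 0;
  for (i = 0; i < n; i++)
    if (got[i] != expected[i])
      err++;
  return err;
}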
Example #6
/*
 * Compute the absolute value of a vector by forcing the sign bit to be zero
 */
inline __m256 fabs_ps(const __m256 x) {
    static const __m256 sign_mask = CAST_INT_TO_REAL_V(_mm256_set1_epi32(1 << 31));
    return _mm256_andnot_ps(sign_mask, x);
}
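/* CAST_INT_TO_REAL_V is defined elsewhere in that code base; a plausible definition is a pure
   bit reinterpretation of the integer vector (no numeric conversion), so that sign_mask ends up
   holding 0x80000000 in every lane: */
#define CAST_INT_TO_REAL_V(v) _mm256_castsi256_ps(v)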
Example #7
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
						avx_m256_t *c1, avx_m256_t *c2) {
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	//avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
	//avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
	//emm21 = _mm256_add_epi32(emm21, _pi32_1);
	//emm22 = _mm256_add_epi32(emm22, _pi32_1);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	//emm21 = _mm256_and_si256(emm21, _pi32_inv1);
	//emm22 = _mm256_and_si256(emm22, _pi32_inv1);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	//avx_m256i_t tempia = _pi32_2;
	//avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
	//avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	//avx_m256i_t tempib = _pi32_4;
	//avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
	//avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
	//avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//emm01 = _mm256_slli_epi32(emm01, 29);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 29);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	//cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1);

	//cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1);

	//tempia = _pi32_2;
	//tempib = _mm256_setzero_si256();
	//emm21 = _mm256_and_si256(emm21, tempia);
	//emm22 = _mm256_and_si256(emm22, tempia);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
	//cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//emm21 = _mm256_cmpeq_epi32(emm21, tempib);
	//emm22 = _mm256_cmpeq_epi32(emm22, tempib);
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
	//cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));
	
	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;

	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);
	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
Example #8
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//emm0 = _mm256_slli_epi32(emm0, 29);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1);

	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);
	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
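/* The repeated extract/shift/insert sequences in newsincos_ps() and newsincos_ps_dual() work
   around plain AVX (pre-AVX2) having no 256-bit integer shift. A small helper capturing that
   pattern, with the shift count fixed at 29 as used above, might look like this: */
static inline avx_m256i_t avx1_slli29_epi32(avx_m256i_t v)
{
	__m128i lo = _mm256_extractf128_si256(v, 0);  /* lower 128-bit half */
	__m128i hi = _mm256_extractf128_si256(v, 1);  /* upper 128-bit half */
	lo = _mm_slli_epi32(lo, 29);                  /* SSE2 32-bit shifts */
	hi = _mm_slli_epi32(hi, 29);
	v = _mm256_insertf128_si256(v, lo, 0);
	v = _mm256_insertf128_si256(v, hi, 1);
	return v;
}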
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);
  
  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  
  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  
  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  _mm256_zeroupper();
  return y;
}
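/* Quick usage sketch for mm256_cos_ps(): eight arguments in, eight cosines out, compared
   against the scalar cosf(). Assumes the m256_ps_* / m256_pi32_* constant tables used above
   are defined elsewhere, plus <immintrin.h>, <math.h> and <stdio.h>. */
static void cos_demo(void)
{
  float in[8]  = { 0.0f, 0.5f, 1.0f, 2.0f, 3.0f, -1.0f, -2.5f, 3.14159265f };
  float out[8];
  _mm256_storeu_ps(out, mm256_cos_ps(_mm256_loadu_ps(in)));
  for (int i = 0; i < 8; i++)
    printf("avx: %f  libm: %f\n", out[i], cosf(in[i]));  /* should agree to ~1e-7 */
}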
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them;
   it is almost as fast, and gives you a free cosine with your sine */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__    
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynom selection mask for the sine*/
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
  
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);
  
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);
  
  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  
  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);
 
  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
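/* Usage sketch: one call to sincos256_ps() gives both results, so sin(x)^2 + cos(x)^2 should
   come out as ~1.0 in every lane (assuming v8sf is the usual typedef for __m256). */
static void sincos_demo(v8sf x)
{
  v8sf s, c;
  sincos256_ps(x, &s, &c);
  v8sf one = _mm256_add_ps(_mm256_mul_ps(s, s), _mm256_mul_ps(c, c));
  float check[8];
  _mm256_storeu_ps(check, one);  /* each entry is 1.0f up to rounding error */
}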
/* evaluation of 8 sines at once using AVX intrinsics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

*/
v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX2, let's perform them using SSE2 directives
  */

#ifdef __AVX2__
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instructions
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask 
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
 
  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}