Esempio n. 1
0
__m256 mm256_exp_ps(__m256 x) {
  __m256 tmp = _mm256_setzero_ps(), fx;
  __m256i emm0;
  __m256 one = *(__m256*)m256_ps_1;

  x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi);
  x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5);

  /* how to perform a floorf with SSE: just below */
  /* step 1 : cast to int */
  emm0 = _mm256_cvttps_epi32(fx);
  /* step 2 : cast back to float */
  tmp  = _mm256_cvtepi32_ps(emm0);

  /* if greater, substract 1 */
  __m256 mask = _mm256_cmp_ps( tmp, fx, _CMP_GT_OS );
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1);
  __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);
  
  __m256 y = *(__m256*)m256_ps_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  emm0 = _mm256_cvttps_epi32(fx);
  emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f);
  emm0 = _mm256_slli_epi32(emm0, 23);
  __m256 pow2n = _mm256_castsi256_ps(emm0);

  y = _mm256_mul_ps(y, pow2n);
  _mm256_zeroupper();
  return y;
}
Esempio n. 2
0
v8sf exp256_ps(v8sf x) {
  v8sf tmp = _mm256_setzero_ps(), fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* how to perform a floorf with SSE: just below */
  //imm0 = _mm256_cvttps_epi32(fx);
  //tmp  = _mm256_cvtepi32_ps(imm0);
  
  tmp = _mm256_floor_ps(fx);

  /* if greater, substract 1 */
  //v8sf mask = _mm256_cmpgt_ps(tmp, fx);    
  v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS);    
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);
  
  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  // another two AVX2 instructions
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
Esempio n. 3
0
inline avx_m256_t newsin_ps(avx_m256_t x) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);
	
	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	emm2 = _mm256_add_epi32(emm2, _pi32_1);
	emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	y = _mm256_cvtepi32_ps(emm2);

	avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	emm0 = _mm256_slli_epi32(emm0, 29);

	emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	
	avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0);
	avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
	sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);
	
	avx_m256_t temp = _ps_minus_cephes_DP123;
	temp = _mm256_mul_ps(y, temp);
	x = _mm256_add_ps(x, temp);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp = _mm256_mul_ps(x2, _ps_0p5);
	temp = _mm256_sub_ps(temp, _ps_1);
	y = _mm256_sub_ps(y, temp);
	y2 = _mm256_add_ps(y2, x);

	y = _mm256_andnot_ps(poly_mask, y);
	y2 = _mm256_and_ps(poly_mask, y2);
	y = _mm256_add_ps(y, y2);

	y = _mm256_xor_ps(y, sign_bit);

	return y;
} // newsin_ps()
Esempio n. 4
0
void static
avx_test (void)
{
  int i;
  union256 s1;
  union256i_d u;
  int e [8];

  s1.x = _mm256_set_ps (45.64, 4564.56, 2.3, 5.5, 57.57, 89.34, 54.12, 954.67);
  u.x = _mm256_cvttps_epi32 (s1.x);

  for (i = 0; i < 8; i++)
    e[i] = (int)s1.a[i];

  if (check_union256i_d (u, e))
    abort ();
}
    __m256 _inner_mm256_exp_ps1(__m256 arg)
    {
        arg = _mm256_mul_ps(arg, _mm256_set1_ps(1.4426950408889634073599246810018921374266459541529859f));

        __m256i e = _mm256_add_epi32(
            _mm256_castps_si256(_mm256_cmp_ps(arg, _mm256_set1_ps(0.0f), _CMP_LT_OQ)),
            _mm256_cvttps_epi32(arg));

        arg = _mm256_sub_ps(arg, _mm256_cvtepi32_ps(e));

        __m256 intermediate_result;
        intermediate_result = _mm256_fmadd_ps(_mm256_set1_ps(0.0136779459179717f), arg, _mm256_set1_ps(0.0517692205767896f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.241554388295527f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.692998430056128f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.999999804292074f));
        arg = intermediate_result;

        __m256 res = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(e, _mm256_set1_epi32(127)), 23));

        res = _mm256_mul_ps(res, arg);

        return res;
    }
Esempio n. 6
0
    inline void Sort4Deg6(__m256 llrI, int pos[], int ipos[])
    {
        int llr[8] __attribute__((aligned(64)));
        const auto v1 = _mm256_set1_ps( 67108864.0f );
        const auto v2 = _mm256_mul_ps( v1, llrI );
        _mm256_store_si256((__m256i *)llr,  _mm256_cvttps_epi32(v2));

        //register float x0,x1,x2,x3,x4,x5;
        const auto x0 = llr[0];
        const auto x1 = llr[1];
        const auto x2 = llr[2];
        const auto x3 = llr[3];
        const auto x4 = llr[4];
        const auto x5 = llr[5];
        int o0 = (x0<x1) +(x0<x2)+(x0<x3)+(x0<x4)+(x0<x5);
        int o1 = (x1<=x0)+(x1<x2)+(x1<x3)+(x1<x4)+(x1<x5);
        int o2 = (x2<=x0)+(x2<=x1)+(x2<x3)+(x2<x4)+(x2<x5);
        int o3 = (x3<=x0)+(x3<=x1)+(x3<=x2)+(x3<x4)+(x3<x5);
        int o4 = (x4<=x0)+(x4<=x1)+(x4<=x2)+(x4<=x3)+(x4<x5);
        int o5 = 15-(o0+o1+o2+o3+o4);
        pos[o0] =  0;  pos[o1]= 1;  pos[o2]= 2;  pos[o3]= 3;  pos[o4]= 4;  pos[o5]= 5;  pos[6]=6;  pos[7]=7;
        ipos[ 0] = o0; ipos[ 1]=o1; ipos[ 2]=o2; ipos[ 3]=o3; ipos[ 4]=o4; ipos[ 5]=o5; ipos[6]=6; ipos[7]=7;
    }
Esempio n. 7
0
  {
    template<class Sig> struct result;
    template<class This,class A0>
    struct result<This(A0)>
      { typedef typename meta::as_integer<A0>::type type; };

    NT2_FUNCTOR_CALL_DISPATCH(
      1,
      typename nt2::meta::scalar_of<A0>::type,
      (3, (float,double,arithmetic_))
    )

    NT2_FUNCTOR_CALL_EVAL_IF(1,  float)
    {
      typedef typename NT2_CALL_RETURN_TYPE(1)::type type;
      type that =  {_mm256_cvttps_epi32(a0)};
      return  that; 
    }
    NT2_FUNCTOR_CALL_EVAL_IF(1, double)
    {
      typedef typename NT2_CALL_RETURN_TYPE(1)::type type;
      const type v = {{a0[0],a0[1], a0[2],a0[3]}}; //TODO with _mm_cvttpd_epi32 
      return v;
    }
    NT2_FUNCTOR_CALL_EVAL_IF(1, arithmetic_)
    {
      return a0;
    }
  };
} }
Esempio n. 8
0
void run_dct(int width, int height, float *quant, float *input, int32_t *output)
{
  float acosvals[8][8];

  /* Calculating cosines is expensive, and there
   * are only 64 cosines that need to be calculated
   * so precompute them and cache. */
  for (int i = 0; i < 8; i++)
  {
    for (int j = 0; j < 8; j++)
    {
      if (j == 0)
      {
        acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5d) * j);
      }
      else
      {
        acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5d) * j);
      }
    }
  }

/* Separate the parallel from the for, so each processor gets its
   * own copy of the buffers and variables. */
#pragma omp parallel
  {
    float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
    avload[0] = sqrt(1.0 / 8.0);
    __m256 row0, row1, row2, row3, row4, row5, row6, row7;
    __m256 loaderlow, loaderhigh;
    __m256 temp;
    __m256 minus128 = _mm256_set1_ps(-128.0);
    __m256 avxcosloader, avxcos;
    float avxcosmover;
    __m256i integer;

/* The DCT breaks the image into 8 by 8 blocks and then
   * transforms them into color frequencies. */
#pragma omp for
    for (int brow = 0; brow < height / 8; brow++)
    {
      for (int bcol = 0; bcol < width / 8; bcol++)
      {
        int head_pointer = bcol * 8 + brow * 8 * width;
        row0 = _mm256_setzero_ps();
        row1 = _mm256_setzero_ps();
        row2 = _mm256_setzero_ps();
        row3 = _mm256_setzero_ps();
        row4 = _mm256_setzero_ps();
        row5 = _mm256_setzero_ps();
        row6 = _mm256_setzero_ps();
        row7 = _mm256_setzero_ps();

        /* This pair of loops uses AVX instuctions to add the frequency
       * component from each pixel to all of the buckets at once.  Allows
       * us to do the DCT on a block in 64 iterations of a loop rather
       * than 64 iterations of 64 iterations of a loop (all 64 pixels affect
       * all 64 frequencies) */
        for (int x = 0; x < 8; x++)
        {
          for (int y = 0; y < 4; y++)
          {
            loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
            loaderlow = _mm256_add_ps(loaderlow, minus128);
            loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
            loaderhigh = _mm256_add_ps(loaderhigh, minus128);

            avxcos = _mm256_loadu_ps(&acosvals[x][0]);
            loaderlow = _mm256_mul_ps(loaderlow, avxcos);
            loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);

            avxcosloader = _mm256_loadu_ps(&acosvals[y][0]);

            avxcosmover = avxcosloader[0];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row0 = _mm256_add_ps(row0, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row0 = _mm256_add_ps(row0, temp);

            avxcosmover = avxcosloader[1];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row1 = _mm256_add_ps(row1, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row1 = _mm256_sub_ps(row1, temp);

            avxcosmover = avxcosloader[2];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row2 = _mm256_add_ps(row2, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row2 = _mm256_add_ps(row2, temp);

            avxcosmover = avxcosloader[3];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row3 = _mm256_add_ps(row3, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row3 = _mm256_sub_ps(row3, temp);

            avxcosmover = avxcosloader[4];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row4 = _mm256_add_ps(row4, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row4 = _mm256_add_ps(row4, temp);

            avxcosmover = avxcosloader[5];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row5 = _mm256_add_ps(row5, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row5 = _mm256_sub_ps(row5, temp);

            avxcosmover = avxcosloader[6];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row6 = _mm256_add_ps(row6, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row6 = _mm256_add_ps(row6, temp);

            avxcosmover = avxcosloader[7];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row7 = _mm256_add_ps(row7, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row7 = _mm256_sub_ps(row7, temp);
          }
        }

        /* Each frequency stored as a float needs to be divided by
       * the quantization value, then rounded to the nearest integer.
       * Also changes the order of the values from pixel order to
       * each 8 by 8 block stored one after another. */
        temp = _mm256_loadu_ps(&quant[0]);
        row0 = _mm256_div_ps(row0, temp);
        row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row0);
        _mm256_storeu_si256(output + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[8]);
        row1 = _mm256_div_ps(row1, temp);
        row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row1);
        _mm256_storeu_si256(output + 8 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[16]);
        row2 = _mm256_div_ps(row2, temp);
        row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row2);
        _mm256_storeu_si256(output + 16 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[24]);
        row3 = _mm256_div_ps(row3, temp);
        row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row3);
        _mm256_storeu_si256(output + 24 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[32]);
        row4 = _mm256_div_ps(row4, temp);
        row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row4);
        _mm256_storeu_si256(output + 32 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[40]);
        row5 = _mm256_div_ps(row5, temp);
        row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row5);
        _mm256_storeu_si256(output + 40 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[48]);
        row6 = _mm256_div_ps(row6, temp);
        row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row6);
        _mm256_storeu_si256(output + 48 + (bcol + brow * (width / 8)) * 64, integer);

        temp = _mm256_loadu_ps(&quant[56]);
        row7 = _mm256_div_ps(row7, temp);
        row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row7);
        _mm256_storeu_si256(output + 56 + (bcol + brow * (width / 8)) * 64, integer);
      }
    }
  }
}
Esempio n. 9
0
inline avx_m256_t newexp_ps(avx_m256_t x) {
	avx_m256_t one = _ps_1;
	avx_m256_t zero = _ps_0;

	x = _mm256_min_ps(x, _ps_exp_hi);
	x = _mm256_max_ps(x, _ps_exp_lo);

	avx_m256_t temp_2 = _mm256_mul_ps(x, _ps_cephes_LOG2EF);
	temp_2 = _mm256_add_ps(temp_2, _ps_0p5);

	avx_m256i_t emm0 = _mm256_cvttps_epi32(temp_2);
	avx_m256_t temp_1 = _mm256_cvtepi32_ps(emm0);
	avx_m256_t temp_3 = _mm256_sub_ps(temp_1, temp_2);
	avx_m256_t mask = _mm256_cmp_ps(temp_3, zero, _CMP_GT_OQ);

	mask = _mm256_and_ps(mask, one);
	temp_2 = _mm256_sub_ps(temp_1, mask);
	emm0 = _mm256_cvttps_epi32(temp_2);

	temp_1 = _mm256_mul_ps(temp_2, _ps_cephes_exp_C12);
	x = _mm256_sub_ps(x, temp_1);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);
 
	temp_1 = _mm256_add_ps(x, one);
	temp_2 = _mm256_mul_ps(x2, _ps_cephes_exp_p5);
	temp_3 = _mm256_mul_ps(x3, _ps_cephes_exp_p4);
	temp_1 = _mm256_add_ps(temp_1, temp_2);

	temp_2 = _mm256_mul_ps(x3, _ps_cephes_exp_p0);

	temp_1 = _mm256_add_ps(temp_1, temp_3);

	avx_m256_t temp_4 = _mm256_mul_ps(x, _ps_cephes_exp_p2);
	temp_3 = _mm256_mul_ps(x2, _ps_cephes_exp_p1);

	emm0 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm0), _mm256_castsi256_ps(_pi32_0x7f)));

	temp_2 = _mm256_add_ps(temp_2, temp_3);
	temp_3 = _mm256_add_ps(temp_3, temp_4);

	//emm0 = _mm256_slli_epi32(emm0, 23);
	// convert emm0 into two 128-bit integer vectors
	// perform shift on both vectors
	// combine both vectors into 256-bit emm0
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 23);
	emm0lo = _mm_slli_epi32(emm0lo, 23);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	avx_m256_t pow2n = _mm256_castsi256_ps(emm0);

	temp_2 = _mm256_add_ps(temp_2, temp_3);
	temp_2 = _mm256_mul_ps(temp_2, x4);

	avx_m256_t y = _mm256_add_ps(temp_1, temp_2);

	y = _mm256_mul_ps(y, pow2n);
	return y;
} // newexp_ps()
Esempio n. 10
0
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
						avx_m256_t *c1, avx_m256_t *c2) {
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	//avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
	//avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
	//emm21 = _mm256_add_epi32(emm21, _pi32_1);
	//emm22 = _mm256_add_epi32(emm22, _pi32_1);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	//emm21 = _mm256_and_si256(emm21, _pi32_inv1);
	//emm22 = _mm256_and_si256(emm22, _pi32_inv1);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	//avx_m256i_t tempia = _pi32_2;
	//avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
	//avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	//avx_m256i_t tempib = _pi32_4;
	//avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
	//avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
	//avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_4)));

	//emm01 = _mm256_slli_epi32(emm01, 29);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 29);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi1, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo1, 1);

	//cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 1);

	//cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 1);

	//tempia = _pi32_2;
	//tempib = _mm256_setzero_si256();
	//emm21 = _mm256_and_si256(emm21, tempia);
	//emm22 = _mm256_and_si256(emm22, tempia);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
	//cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_2)));

	//emm21 = _mm256_cmpeq_epi32(emm21, tempib);
	//emm22 = _mm256_cmpeq_epi32(emm22, tempib);
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
	//cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));
	
	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;

	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);
	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
Esempio n. 11
0
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_4)));

	//emm0 = _mm256_slli_epi32(emm0, 29);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 1);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 1);

	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2),
											_mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);
	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
Esempio n. 12
0
inline void newexp_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t* exp1, avx_m256_t* exp2) {
	avx_m256_t one = _ps_1;
	avx_m256_t zero = _ps_0;

	x1 = _mm256_min_ps(x1, _ps_exp_hi);
	x2 = _mm256_min_ps(x2, _ps_exp_hi);
	x1 = _mm256_max_ps(x1, _ps_exp_lo);
	x2 = _mm256_max_ps(x2, _ps_exp_lo);

	avx_m256_t temp_21 = _mm256_mul_ps(x1, _ps_cephes_LOG2EF);
	avx_m256_t temp_22 = _mm256_mul_ps(x2, _ps_cephes_LOG2EF);
	temp_21 = _mm256_add_ps(temp_21, _ps_0p5);
	temp_22 = _mm256_add_ps(temp_22, _ps_0p5);

	avx_m256i_t emm01 = _mm256_cvttps_epi32(temp_21);
	avx_m256i_t emm02 = _mm256_cvttps_epi32(temp_22);
	avx_m256_t temp_11 = _mm256_cvtepi32_ps(emm01);
	avx_m256_t temp_12 = _mm256_cvtepi32_ps(emm02);
	avx_m256_t temp_31 = _mm256_sub_ps(temp_11, temp_21);
	avx_m256_t temp_32 = _mm256_sub_ps(temp_12, temp_22);
	avx_m256_t mask1 = _mm256_cmp_ps(temp_31, zero, _CMP_GT_OQ);
	avx_m256_t mask2 = _mm256_cmp_ps(temp_32, zero, _CMP_GT_OQ);

	mask1 = _mm256_and_ps(mask1, one);
	mask2 = _mm256_and_ps(mask2, one);
	temp_21 = _mm256_sub_ps(temp_11, mask1);
	temp_22 = _mm256_sub_ps(temp_12, mask2);
	emm01 = _mm256_cvttps_epi32(temp_21);
	emm02 = _mm256_cvttps_epi32(temp_22);

	temp_11 = _mm256_mul_ps(temp_21, _ps_cephes_exp_C12);
	temp_12 = _mm256_mul_ps(temp_22, _ps_cephes_exp_C12);
	x1 = _mm256_sub_ps(x1, temp_11);
	x2 = _mm256_sub_ps(x2, temp_12);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);
 
	temp_11 = _mm256_add_ps(x1, one);
	temp_12 = _mm256_add_ps(x2, one);
	temp_21 = _mm256_mul_ps(x21, _ps_cephes_exp_p5);
	temp_22 = _mm256_mul_ps(x22, _ps_cephes_exp_p5);
	temp_31 = _mm256_mul_ps(x31, _ps_cephes_exp_p4);
	temp_32 = _mm256_mul_ps(x32, _ps_cephes_exp_p4);
	temp_11 = _mm256_add_ps(temp_11, temp_21);
	temp_12 = _mm256_add_ps(temp_12, temp_22);

	temp_21 = _mm256_mul_ps(x31, _ps_cephes_exp_p0);
	temp_22 = _mm256_mul_ps(x32, _ps_cephes_exp_p0);

	temp_11 = _mm256_add_ps(temp_11, temp_31);
	temp_12 = _mm256_add_ps(temp_12, temp_32);

	avx_m256_t temp_41 = _mm256_mul_ps(x1, _ps_cephes_exp_p2);
	avx_m256_t temp_42 = _mm256_mul_ps(x2, _ps_cephes_exp_p2);
	temp_31 = _mm256_mul_ps(x21, _ps_cephes_exp_p1);
	temp_32 = _mm256_mul_ps(x22, _ps_cephes_exp_p1);

	//emm01 = _mm256_add_epi32(emm01, _pi32_0x7f);
	//emm02 = _mm256_add_epi32(emm02, _pi32_0x7f);
	emm01 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm01), _mm256_castsi256_ps(_pi32_0x7f)));
	emm02 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm02), _mm256_castsi256_ps(_pi32_0x7f)));

	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_31 = _mm256_add_ps(temp_31, temp_41);
	temp_32 = _mm256_add_ps(temp_32, temp_42);

	//emm01 = _mm256_slli_epi32(emm01, 23);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 1);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 23);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 23);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 23);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 1);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 23);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 23);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 1);

	avx_m256_t pow2n1 = _mm256_castsi256_ps(emm01);
	avx_m256_t pow2n2 = _mm256_castsi256_ps(emm02);

	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_21 = _mm256_mul_ps(temp_21, x41);
	temp_22 = _mm256_mul_ps(temp_22, x42);

	avx_m256_t y1 = _mm256_add_ps(temp_11, temp_21);
	avx_m256_t y2 = _mm256_add_ps(temp_12, temp_22);

	*exp1 = _mm256_mul_ps(y1, pow2n1);
	*exp2 = _mm256_mul_ps(y2, pow2n2);
} // newexp_ps_dual()
Esempio n. 13
0
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);
  
  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  
  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "******" 
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  
  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  _mm256_zeroupper();
  return y;
}
Esempio n. 14
0
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__    
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynom selection mask for the sine*/
  imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
  
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);
  
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "******" 
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si128(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);
  
  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  
  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);
 
  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
Esempio n. 15
0
/* evaluation of 8 sines at onces using AVX intrisics

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

*/
v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX, let's perform them using SSE2 directives
  */

#ifdef __AVX2__
  /* store the integer part of y in mm0 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  // another two AVX2 instruction
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag */
  imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask 
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
 
  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "******" 
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}