Example #1
0
double vNormalIntegral(double b)
{
  __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp; 
  //NN/2-1 has to be the multiple of 8
  //NN = (8*LV+1)*2, LV = 20 -> NN = 322
  //const int NN = 322; 
  const int vecsize = 8; 
  const int nCal = (NN/2-1)/vecsize;
  //const int left = NN%vecsize;
  double a = 0.0f;
  double s, h, sum = 0.0f;
  h = (b-a)/NN;
  // add in the first few terms 
  sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0);
  // and the last one
  sum += exp(-b*b/2.0);

  vec_cf0 = _mm512_set1_pd(a);
  vec_cf1 = _mm512_set1_pd(2*h);
  vec_cf2 = _mm512_set1_pd(-0.5);

  vec_s   = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize
  vec_s   = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h)
  vec_s   = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h)
  
  vec_stp = _mm512_set1_pd(2*h*vecsize-h);
  vec_cf0 = _mm512_set1_pd(h);
  
  for (int i = 0; i < nCal; ++i){
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum
    sum    += 2.0*_mm512_reduce_add_pd(vec_cf1);

    vec_s   = _mm512_add_pd(vec_s, vec_cf0);//s+=h
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);
    sum    += 4.0*_mm512_reduce_add_pd(vec_cf1);
    
    vec_s   = _mm512_add_pd(vec_s, vec_stp);
  }

  sum = 0.5*sqrt(2*PI) + h*sum/3.0;
  return sum;
}
Example #2
0
double test_mm512_reduce_add_pd(__m512d __W){
  // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0
  // CHECK: ret double %vecext.i
  return _mm512_reduce_add_pd(__W); 
}