示例#1
0
double vNormalIntegral(double b)
{
  __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp; 
  //NN/2-1 has to be the multiple of 8
  //NN = (8*LV+1)*2, LV = 20 -> NN = 322
  //const int NN = 322; 
  const int vecsize = 8; 
  const int nCal = (NN/2-1)/vecsize;
  //const int left = NN%vecsize;
  double a = 0.0f;
  double s, h, sum = 0.0f;
  h = (b-a)/NN;
  // add in the first few terms 
  sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0);
  // and the last one
  sum += exp(-b*b/2.0);

  vec_cf0 = _mm512_set1_pd(a);
  vec_cf1 = _mm512_set1_pd(2*h);
  vec_cf2 = _mm512_set1_pd(-0.5);

  vec_s   = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize
  vec_s   = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h)
  vec_s   = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h)
  
  vec_stp = _mm512_set1_pd(2*h*vecsize-h);
  vec_cf0 = _mm512_set1_pd(h);
  
  for (int i = 0; i < nCal; ++i){
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum
    sum    += 2.0*_mm512_reduce_add_pd(vec_cf1);

    vec_s   = _mm512_add_pd(vec_s, vec_cf0);//s+=h
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);
    sum    += 4.0*_mm512_reduce_add_pd(vec_cf1);
    
    vec_s   = _mm512_add_pd(vec_s, vec_stp);
  }

  sum = 0.5*sqrt(2*PI) + h*sum/3.0;
  return sum;
}
示例#2
0
文件: hpsi.c 项目: ARTED/ARTED
  double         const* b;
  double complex const* e;
  double complex      * f;

  int ix, iy, iz, n;

#ifdef ARTED_STENCIL_LOOP_BLOCKING
  int bx, by;
#endif

  __m512d at   = _mm512_set1_pd(A);
  __m512d HALF = _mm512_set1_pd(-0.5);
#ifdef TUNING_COMPLEX_MUL
  __m512i INV  = _mm512_set4_epi64(1LL << 63, 0, 1LL << 63, 0);
#else
  __m512d ZI   = _mm512_set_pd(-1, 0, -1, 0, -1, 0, -1, 0);
#endif

  __declspec(align(64)) double G[12];
  for(n = 0 ; n < 12 ; ++n)
    G[n] = C[n] * -0.5;

  __m512i nly = _mm512_set1_epi32(PNLy);
  __m512i nlz = _mm512_set1_epi32(PNLz);
#ifdef ARTED_DOMAIN_POWER_OF_TWO
  __m512i myx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy - 1), _mm512_set1_epi32(NLx - 1));
  __m512i nyx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy    ), _mm512_set1_epi32(NLx    ));
#else
  __m512i nyx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(PNLy   ), _mm512_set1_epi32(PNLx   ));
#endif
示例#3
0
foo (double x)
{
  /* NOTE(review): the return type is not visible in this excerpt;
     presumably __m512d -- confirm against the full file.  */
  /* All eight arguments are x, so every lane of the returned vector
     holds x.  */
  return _mm512_set_pd (x, x, x, x, x, x, x, x);
}
示例#4
0
foo (double *v)
{
  /* NOTE(review): the return type is not visible in this excerpt;
     presumably __m512d -- confirm against the full file.  */
  /* Reads v[0]..v[7].  _mm512_set_pd takes its arguments from the highest
     lane down to the lowest, so lane i of the result holds v[i].  */
  return _mm512_set_pd (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
}
                     , double complex const* restrict E
                     , double              * restrict F
                     , double              * restrict G
                     , double              * restrict H
) {
#if defined(__KNC__) || defined(__AVX512F__)
/* Hand-Code Vector processing for Xeon Phi */
  double complex const* e;

  int ix, iy, iz, n;

#ifdef ARTED_STENCIL_LOOP_BLOCKING
  int bx, by;
#endif

  __m512d CONJ = _mm512_set_pd(-1, 1, -1, 1, -1, 1, -1, 1);

  __m512d tt[3];
  for(n = 0 ; n < 3 ; ++n)
    tt[n] = _mm512_setzero_pd();

  __m512d wm[4];
  __m512d wp[4];
  __m512d v1, v2, v3;

  __m512i nly = _mm512_set1_epi32(NLy);
  __m512i nlz = _mm512_set1_epi32(NLz);
  __m512i nyx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy    ), _mm512_set1_epi32(NLx    ));
#ifdef ARTED_DOMAIN_POWER_OF_TWO
  __m512i myx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy - 1), _mm512_set1_epi32(NLx - 1));
#endif