double vNormalIntegral(double b) { __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp; //NN/2-1 has to be the multiple of 8 //NN = (8*LV+1)*2, LV = 20 -> NN = 322 //const int NN = 322; const int vecsize = 8; const int nCal = (NN/2-1)/vecsize; //const int left = NN%vecsize; double a = 0.0f; double s, h, sum = 0.0f; h = (b-a)/NN; // add in the first few terms sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0); // and the last one sum += exp(-b*b/2.0); vec_cf0 = _mm512_set1_pd(a); vec_cf1 = _mm512_set1_pd(2*h); vec_cf2 = _mm512_set1_pd(-0.5); vec_s = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize vec_s = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h) vec_s = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h) vec_stp = _mm512_set1_pd(2*h*vecsize-h); vec_cf0 = _mm512_set1_pd(h); for (int i = 0; i < nCal; ++i){ vec_exp = _mm512_mul_pd(vec_s, vec_s); vec_exp = _mm512_mul_pd(vec_exp, vec_cf2); vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum sum += 2.0*_mm512_reduce_add_pd(vec_cf1); vec_s = _mm512_add_pd(vec_s, vec_cf0);//s+=h vec_exp = _mm512_mul_pd(vec_s, vec_s); vec_exp = _mm512_mul_pd(vec_exp, vec_cf2); vec_cf1 = _mm512_exp_pd(vec_exp); sum += 4.0*_mm512_reduce_add_pd(vec_cf1); vec_s = _mm512_add_pd(vec_s, vec_stp); } sum = 0.5*sqrt(2*PI) + h*sum/3.0; return sum; }
double const* b; double complex const* e; double complex * f; int ix, iy, iz, n; #ifdef ARTED_STENCIL_LOOP_BLOCKING int bx, by; #endif __m512d at = _mm512_set1_pd(A); __m512d HALF = _mm512_set1_pd(-0.5); #ifdef TUNING_COMPLEX_MUL __m512i INV = _mm512_set4_epi64(1LL << 63, 0, 1LL << 63, 0); #else __m512d ZI = _mm512_set_pd(-1, 0, -1, 0, -1, 0, -1, 0); #endif __declspec(align(64)) double G[12]; for(n = 0 ; n < 12 ; ++n) G[n] = C[n] * -0.5; __m512i nly = _mm512_set1_epi32(PNLy); __m512i nlz = _mm512_set1_epi32(PNLz); #ifdef ARTED_DOMAIN_POWER_OF_TWO __m512i myx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy - 1), _mm512_set1_epi32(NLx - 1)); __m512i nyx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy ), _mm512_set1_epi32(NLx )); #else __m512i nyx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(PNLy ), _mm512_set1_epi32(PNLx )); #endif
/* Return a 512-bit vector whose eight double lanes all hold x.
   The return type is spelled out: the value returned is a __m512d, and
   implicit int is not valid since C99.  */
__m512d
foo (double x)
{
  return _mm512_set_pd (x, x, x, x, x, x, x, x);
}
/* Pack v[0] .. v[7] into one 512-bit vector: v[0] goes to the lowest
   lane and v[7] to the highest.  Reads eight doubles starting at v.
   The return type is spelled out: the value returned is a __m512d, and
   implicit int is not valid since C99.  */
__m512d
foo (double *v)
{
  return _mm512_set_pd (v[7], v[6], v[5], v[4], v[3], v[2], v[1], v[0]);
}
, double complex const* restrict E , double * restrict F , double * restrict G , double * restrict H ) { #if defined(__KNC__) || defined(__AVX512F__) /* Hand-Code Vector processing for Xeon Phi */ double complex const* e; int ix, iy, iz, n; #ifdef ARTED_STENCIL_LOOP_BLOCKING int bx, by; #endif __m512d CONJ = _mm512_set_pd(-1, 1, -1, 1, -1, 1, -1, 1); __m512d tt[3]; for(n = 0 ; n < 3 ; ++n) tt[n] = _mm512_setzero_pd(); __m512d wm[4]; __m512d wp[4]; __m512d v1, v2, v3; __m512i nly = _mm512_set1_epi32(NLy); __m512i nlz = _mm512_set1_epi32(NLz); __m512i nyx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy ), _mm512_set1_epi32(NLx )); #ifdef ARTED_DOMAIN_POWER_OF_TWO __m512i myx = _mm512_mask_blend_epi32(0xFF00, _mm512_set1_epi32(NLy - 1), _mm512_set1_epi32(NLx - 1)); #endif