double vNormalIntegral(double b) { __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp; //NN/2-1 has to be the multiple of 8 //NN = (8*LV+1)*2, LV = 20 -> NN = 322 //const int NN = 322; const int vecsize = 8; const int nCal = (NN/2-1)/vecsize; //const int left = NN%vecsize; double a = 0.0f; double s, h, sum = 0.0f; h = (b-a)/NN; // add in the first few terms sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0); // and the last one sum += exp(-b*b/2.0); vec_cf0 = _mm512_set1_pd(a); vec_cf1 = _mm512_set1_pd(2*h); vec_cf2 = _mm512_set1_pd(-0.5); vec_s = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize vec_s = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h) vec_s = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h) vec_stp = _mm512_set1_pd(2*h*vecsize-h); vec_cf0 = _mm512_set1_pd(h); for (int i = 0; i < nCal; ++i){ vec_exp = _mm512_mul_pd(vec_s, vec_s); vec_exp = _mm512_mul_pd(vec_exp, vec_cf2); vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum sum += 2.0*_mm512_reduce_add_pd(vec_cf1); vec_s = _mm512_add_pd(vec_s, vec_cf0);//s+=h vec_exp = _mm512_mul_pd(vec_s, vec_s); vec_exp = _mm512_mul_pd(vec_exp, vec_cf2); vec_cf1 = _mm512_exp_pd(vec_exp); sum += 4.0*_mm512_reduce_add_pd(vec_cf1); vec_s = _mm512_add_pd(vec_s, vec_stp); } sum = 0.5*sqrt(2*PI) + h*sum/3.0; return sum; }
// Codegen test for the _mm512_reduce_add_pd intrinsic: passes the 8-lane
// double vector __W straight to the intrinsic and returns its result.
// The FileCheck directives inside the body pin the expected IR: the vector is
// split into halves that are added lane-wise (8 -> 4 -> 2 lanes), lane 1 is
// then shuffled onto lane 0 for a final add, and element 0 is extracted and
// returned. The directives reference the parameter by name (%__W), so the
// parameter name and the directive text must stay exactly as written.
double test_mm512_reduce_add_pd(__m512d __W){
  // CHECK: %shuffle.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: %shuffle1.i = shufflevector <8 x double> %__W, <8 x double> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  // CHECK: %add.i = fadd <4 x double> %shuffle.i, %shuffle1.i
  // CHECK: %shuffle2.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 0, i32 1>
  // CHECK: %shuffle3.i = shufflevector <4 x double> %add.i, <4 x double> undef, <2 x i32> <i32 2, i32 3>
  // CHECK: %add4.i = fadd <2 x double> %shuffle2.i, %shuffle3.i
  // CHECK: %shuffle6.i = shufflevector <2 x double> %add4.i, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
  // CHECK: %add7.i = fadd <2 x double> %add4.i, %shuffle6.i
  // CHECK: %vecext.i = extractelement <2 x double> %add7.i, i32 0
  // CHECK: ret double %vecext.i
  return _mm512_reduce_add_pd(__W);
}