/*
 * Integrate the (unnormalized) standard-normal density exp(-x^2/2) over
 * [0, b] with composite Simpson's rule on NN sub-intervals (NN is a
 * file-scope constant; the commented notes below record its intended
 * value), vectorized eight doubles at a time with AVX-512.
 *
 * Returns 0.5*sqrt(2*PI) + integral/3 form, i.e. sqrt(2*PI) times the
 * standard normal CDF evaluated at b (assuming b >= 0 — TODO confirm
 * callers never pass b < 0, which Simpson's rule here would mishandle).
 *
 * NOTE(review): uses the SVML intrinsic _mm512_exp_pd and the MSVC/ICC
 * __declspec(align(64)) attribute — this compiles with the Intel
 * compiler, not plain GCC/Clang.
 */
double vNormalIntegral(double b)
{
    __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp;
    // NN/2-1 has to be a multiple of 8 for the vector loop below to
    // cover the interior nodes exactly.
    // NN = (8*LV+1)*2, LV = 20 -> NN = 322
    //const int NN = 322;
    const int vecsize = 8;                 /* doubles per __m512d lane group */
    const int nCal = (NN/2-1)/vecsize;     /* vectorized iterations over node pairs */
    //const int left = NN%vecsize;
    double a = 0.0f;                       /* lower bound of integration */
    double s, h, sum = 0.0f;               /* NOTE(review): 's' is never used */
    h = (b-a)/NN;                          /* Simpson step width */
    // add in the first few terms: f(a) with weight 1 and f(a+h) with weight 4
    sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0);
    // and the last one: f(b) with weight 1
    sum += exp(-b*b/2.0);
    vec_cf0 = _mm512_set1_pd(a);
    vec_cf1 = _mm512_set1_pd(2*h);
    vec_cf2 = _mm512_set1_pd(-0.5);        /* exponent scale: exp(-x^2/2) */
    vec_s = _mm512_set_pd(8,7,6,5,4,3,2,1);//vectorize
    vec_s = _mm512_mul_pd(vec_s, vec_cf1);//(16h,14h,..,2h)
    vec_s = _mm512_add_pd(vec_cf0, vec_s);//(a+16h,..,a+2h)
    /* stride from the last odd node of one block (a+..+h) to the first
       even node of the next block: 2*h*8 - h = 15h */
    vec_stp = _mm512_set1_pd(2*h*vecsize-h);
    vec_cf0 = _mm512_set1_pd(h);           /* re-purposed: +h shifts even -> odd nodes */
    for (int i = 0; i < nCal; ++i){
        /* even interior nodes: weight 2 */
        vec_exp = _mm512_mul_pd(vec_s, vec_s);
        vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
        vec_cf1 = _mm512_exp_pd(vec_exp);//vec_cf1->sum
        sum += 2.0*_mm512_reduce_add_pd(vec_cf1);
        /* odd interior nodes (shift by h): weight 4 */
        vec_s = _mm512_add_pd(vec_s, vec_cf0);//s+=h
        vec_exp = _mm512_mul_pd(vec_s, vec_s);
        vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
        vec_cf1 = _mm512_exp_pd(vec_exp);
        sum += 4.0*_mm512_reduce_add_pd(vec_cf1);
        /* advance to the next block of 8 node pairs */
        vec_s = _mm512_add_pd(vec_s, vec_stp);
    }
    /* Simpson total h*sum/3, offset by the integral over (-inf, 0] */
    sum = 0.5*sqrt(2*PI) + h*sum/3.0;
    return sum;
}
/*
 * Smoke test exercising the AVX-512F double-precision add intrinsics:
 * plain, merge-masked, zero-masked, and the three explicitly-rounded
 * variants.  'x' (__m512d) and 'm' (a mask) are globals defined
 * elsewhere in this file/test harness; results are written back to 'x'
 * so the calls cannot be optimized away.
 */
void extern avx512f_test (void)
{
  x = _mm512_add_pd (x, x);                 /* vaddpd                  */
  x = _mm512_mask_add_pd (x, m, x, x);      /* merge-masked add        */
  x = _mm512_maskz_add_pd (m, x, x);        /* zero-masked add         */
  /* explicit rounding-mode variants (all with exceptions suppressed) */
  x = _mm512_add_round_pd (x, x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
  x = _mm512_mask_add_round_pd (x, m, x, x, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
  x = _mm512_maskz_add_round_pd (m, x, x, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}
/*
 * Cache-bandwidth benchmark: per-thread SAXPY (z = alpha*x + y) over
 * working sets growing from 1KB to 32KB per array, run under OpenMP
 * with up to 4 threads, each thread owning its own A/B/O buffers.
 * Reports aggregate read+write bandwidth (3 streams per thread).
 *
 * Usage: prog [nthreads]   (default 4; the switch() below only maps
 * thread ids 0..3 to buffers, so nthreads > 4 asserts).
 *
 * DT, DB, DB_LVL, SEPERATOR, SAVE_DATA, fill() and timer() are
 * project-local macros/helpers defined elsewhere.
 */
int main(int argc, char** argv)
{
    /* variable declaration */
    DB(DB_LVL, "declaration");
    DT * memA_t0, *memA_t1, *memA_t2, *memA_t3;  /* per-thread input x  */
    DT * memB_t0, *memB_t1, *memB_t2, *memB_t3;  /* per-thread input y  */
    DT * memO_t0, *memO_t1, *memO_t2, *memO_t3;  /* per-thread output z */
    int reps, size;                /* NOTE(review): 'size' is unused */
    int samples;
    int tid;                       /* NOTE(review): unused; shadowed inside the parallel region */
    int i, p, r, bytes, elems;     /* NOTE(review): outer 'r' shadowed by inner 'int r' */
    int bytes_min, bytes_max;
    int elems_min, elems_max;
    double func_overhead;          /* NOTE(review): unused */
    /* t_start/t_end are shared and written by every thread inside the
       parallel region without synchronization — a benign-looking data
       race; NOTE(review): consider timing on a single thread only. */
    double t_start, t_end;
    double t_min, c_min;
    double alpha = 0.5;
    DB(DB_LVL, SEPERATOR);
    /* initialization */
    DB(DB_LVL, "intialization");
    samples = 3;                   /* timing uses the last sample only */
    bytes_min = 1024, bytes_max = 1024*32; /* [1KB, 32KB] */
    elems_min = bytes_min/sizeof(DT), elems_max = bytes_max/sizeof(DT); /* the number of elements */
    reps = 40000;
    DB(DB_LVL, SEPERATOR);
    /* omp environment */
    const int nthreads = argc > 1 ? atoi(argv[1]) : 4;
    fprintf(stderr , "nthreads= %d\n", nthreads);
    omp_set_num_threads(nthreads);
    /* iteration over working-set sizes */
    DB(DB_LVL, "measurement");
    for(elems=elems_min, bytes=bytes_min; elems<=elems_max; elems=elems+elems_min, bytes=bytes+bytes_min) {
        /* allocate full-size (bytes_max) 64-byte-aligned buffers each
           step, even though only 'elems' elements are touched */
        memA_t0 = (DT *)_mm_malloc(bytes_max, 64);
        memA_t1 = (DT *)_mm_malloc(bytes_max, 64);
        memA_t2 = (DT *)_mm_malloc(bytes_max, 64);
        memA_t3 = (DT *)_mm_malloc(bytes_max, 64);
        memB_t0 = (DT *)_mm_malloc(bytes_max, 64);
        memB_t1 = (DT *)_mm_malloc(bytes_max, 64);
        memB_t2 = (DT *)_mm_malloc(bytes_max, 64);
        memB_t3 = (DT *)_mm_malloc(bytes_max, 64);
        memO_t0 = (DT *)_mm_malloc(bytes_max, 64);
        memO_t1 = (DT *)_mm_malloc(bytes_max, 64);
        memO_t2 = (DT *)_mm_malloc(bytes_max, 64);
        memO_t3 = (DT *)_mm_malloc(bytes_max, 64);
        /* initialization a local space */
        fill(memA_t0, elems, 1.0);
        fill(memA_t1, elems, 2.0);
        fill(memA_t2, elems, 3.0);
        fill(memA_t3, elems, 4.0);
        fill(memB_t0, elems, 1.0);
        fill(memB_t1, elems, 2.0);
        fill(memB_t2, elems, 3.0);
        fill(memB_t3, elems, 4.0);
        fill(memO_t0, elems, 1.0);
        fill(memO_t1, elems, 2.0);
        fill(memO_t2, elems, 3.0);
        fill(memO_t3, elems, 4.0);
        /* measurement */
        t_min = 0.0f;
        c_min = 0.0f;
        DT ret_t0 = 0.0;
        DT ret_t1 = 0.0;
        DT ret_t2 = 0.0;
        DT ret_t3 = 0.0;
/* NOTE(review): both branches expand Z identically — the SAXPY2
   distinction is currently a no-op. */
#ifdef SAXPY2
#define Z _z
#else
#define Z _z
#endif
        for(p=0; p<samples; p++) {
            __m512d *_x, *_y, *_z;
#pragma omp parallel private(_x,_y,_z) default(shared)
            {
                int tid;
                tid = omp_get_thread_num();
                /* bind each thread to its private triple of buffers */
                switch(tid) {
                case 0: _x = (__m512d*)memA_t0; _y = (__m512d*)memB_t0; _z = (__m512d*)memO_t0; break;
                case 1: _x = (__m512d*)memA_t1; _y = (__m512d*)memB_t1; _z = (__m512d*)memO_t1; break;
                case 2: _x = (__m512d*)memA_t2; _y = (__m512d*)memB_t2; _z = (__m512d*)memO_t2; break;
                case 3: _x = (__m512d*)memA_t3; _y = (__m512d*)memB_t3; _z = (__m512d*)memO_t3; break;
                default: assert(0);
                }
#pragma omp barrier
                /* only the final sample is timed */
                if(p==(samples-1)) t_start = timer();
                int r;
                for(r=0; r<reps; r++) {
                    asm("#t0-beg");  /* marker for reading the generated asm */
#if 0
                    /* disabled scalar reference version */
                    double *memO_t0 = (double*)Z;
                    const double *memA_t0 = (double*)_x;
                    const double *memB_t0 = (double*)_y;
#pragma vector aligned
                    for(i=0; i<elems; i=i+1) {
                        //ret_t0 += mem_t0[i];
                        memO_t0[i] = alpha * memA_t0[i] + memB_t0[i];
                    }
                    memO_t0[0] = memO_t0[0] * 0.1; // to avoid overflow and optimizations
#else
                    /* intrinsic version: cnts = number of 8-double
                       vectors; manually unrolled 64x.  NOTE(review):
                       assumes cnts is a multiple of 64 — smaller or
                       ragged sizes would overrun 'elems' (though the
                       buffers are bytes_max long, so no OOB write). */
                    const int cnts = elems >> 3;
                    const __m512d _a = _mm512_set1_pd(alpha);
                    int ib;
                    for (ib = 0; ib < cnts; ib += 8*8) {
                        Z[ib+0] = _mm512_add_pd(_y[ib+0], _mm512_mul_pd(_a,_x[ib+0]));
                        Z[ib+1] = _mm512_add_pd(_y[ib+1], _mm512_mul_pd(_a,_x[ib+1]));
                        Z[ib+2] = _mm512_add_pd(_y[ib+2], _mm512_mul_pd(_a,_x[ib+2]));
                        Z[ib+3] = _mm512_add_pd(_y[ib+3], _mm512_mul_pd(_a,_x[ib+3]));
                        Z[ib+4] = _mm512_add_pd(_y[ib+4], _mm512_mul_pd(_a,_x[ib+4]));
                        Z[ib+5] = _mm512_add_pd(_y[ib+5], _mm512_mul_pd(_a,_x[ib+5]));
                        Z[ib+6] = _mm512_add_pd(_y[ib+6], _mm512_mul_pd(_a,_x[ib+6]));
                        Z[ib+7] = _mm512_add_pd(_y[ib+7], _mm512_mul_pd(_a,_x[ib+7]));
                        Z[ib+8+0] = _mm512_add_pd(_y[ib+8+0], _mm512_mul_pd(_a,_x[ib+8+0]));
                        Z[ib+8+1] = _mm512_add_pd(_y[ib+8+1], _mm512_mul_pd(_a,_x[ib+8+1]));
                        Z[ib+8+2] = _mm512_add_pd(_y[ib+8+2], _mm512_mul_pd(_a,_x[ib+8+2]));
                        Z[ib+8+3] = _mm512_add_pd(_y[ib+8+3], _mm512_mul_pd(_a,_x[ib+8+3]));
                        Z[ib+8+4] = _mm512_add_pd(_y[ib+8+4], _mm512_mul_pd(_a,_x[ib+8+4]));
                        Z[ib+8+5] = _mm512_add_pd(_y[ib+8+5], _mm512_mul_pd(_a,_x[ib+8+5]));
                        Z[ib+8+6] = _mm512_add_pd(_y[ib+8+6], _mm512_mul_pd(_a,_x[ib+8+6]));
                        Z[ib+8+7] = _mm512_add_pd(_y[ib+8+7], _mm512_mul_pd(_a,_x[ib+8+7]));
                        Z[ib+16+0] = _mm512_add_pd(_y[ib+16+0], _mm512_mul_pd(_a,_x[ib+16+0]));
                        Z[ib+16+1] = _mm512_add_pd(_y[ib+16+1], _mm512_mul_pd(_a,_x[ib+16+1]));
                        Z[ib+16+2] = _mm512_add_pd(_y[ib+16+2], _mm512_mul_pd(_a,_x[ib+16+2]));
                        Z[ib+16+3] = _mm512_add_pd(_y[ib+16+3], _mm512_mul_pd(_a,_x[ib+16+3]));
                        Z[ib+16+4] = _mm512_add_pd(_y[ib+16+4], _mm512_mul_pd(_a,_x[ib+16+4]));
                        Z[ib+16+5] = _mm512_add_pd(_y[ib+16+5], _mm512_mul_pd(_a,_x[ib+16+5]));
                        Z[ib+16+6] = _mm512_add_pd(_y[ib+16+6], _mm512_mul_pd(_a,_x[ib+16+6]));
                        Z[ib+16+7] = _mm512_add_pd(_y[ib+16+7], _mm512_mul_pd(_a,_x[ib+16+7]));
                        Z[ib+24+0] = _mm512_add_pd(_y[ib+24+0], _mm512_mul_pd(_a,_x[ib+24+0]));
                        Z[ib+24+1] = _mm512_add_pd(_y[ib+24+1], _mm512_mul_pd(_a,_x[ib+24+1]));
                        Z[ib+24+2] = _mm512_add_pd(_y[ib+24+2], _mm512_mul_pd(_a,_x[ib+24+2]));
                        Z[ib+24+3] = _mm512_add_pd(_y[ib+24+3], _mm512_mul_pd(_a,_x[ib+24+3]));
                        Z[ib+24+4] = _mm512_add_pd(_y[ib+24+4], _mm512_mul_pd(_a,_x[ib+24+4]));
                        Z[ib+24+5] = _mm512_add_pd(_y[ib+24+5], _mm512_mul_pd(_a,_x[ib+24+5]));
                        Z[ib+24+6] = _mm512_add_pd(_y[ib+24+6], _mm512_mul_pd(_a,_x[ib+24+6]));
                        Z[ib+24+7] = _mm512_add_pd(_y[ib+24+7], _mm512_mul_pd(_a,_x[ib+24+7]));
                        Z[ib+32+0] = _mm512_add_pd(_y[ib+32+0], _mm512_mul_pd(_a,_x[ib+32+0]));
                        Z[ib+32+1] = _mm512_add_pd(_y[ib+32+1], _mm512_mul_pd(_a,_x[ib+32+1]));
                        Z[ib+32+2] = _mm512_add_pd(_y[ib+32+2], _mm512_mul_pd(_a,_x[ib+32+2]));
                        Z[ib+32+3] = _mm512_add_pd(_y[ib+32+3], _mm512_mul_pd(_a,_x[ib+32+3]));
                        Z[ib+32+4] = _mm512_add_pd(_y[ib+32+4], _mm512_mul_pd(_a,_x[ib+32+4]));
                        Z[ib+32+5] = _mm512_add_pd(_y[ib+32+5], _mm512_mul_pd(_a,_x[ib+32+5]));
                        Z[ib+32+6] = _mm512_add_pd(_y[ib+32+6], _mm512_mul_pd(_a,_x[ib+32+6]));
                        Z[ib+32+7] = _mm512_add_pd(_y[ib+32+7], _mm512_mul_pd(_a,_x[ib+32+7]));
                        Z[ib+40+0] = _mm512_add_pd(_y[ib+40+0], _mm512_mul_pd(_a,_x[ib+40+0]));
                        Z[ib+40+1] = _mm512_add_pd(_y[ib+40+1], _mm512_mul_pd(_a,_x[ib+40+1]));
                        Z[ib+40+2] = _mm512_add_pd(_y[ib+40+2], _mm512_mul_pd(_a,_x[ib+40+2]));
                        Z[ib+40+3] = _mm512_add_pd(_y[ib+40+3], _mm512_mul_pd(_a,_x[ib+40+3]));
                        Z[ib+40+4] = _mm512_add_pd(_y[ib+40+4], _mm512_mul_pd(_a,_x[ib+40+4]));
                        Z[ib+40+5] = _mm512_add_pd(_y[ib+40+5], _mm512_mul_pd(_a,_x[ib+40+5]));
                        Z[ib+40+6] = _mm512_add_pd(_y[ib+40+6], _mm512_mul_pd(_a,_x[ib+40+6]));
                        Z[ib+40+7] = _mm512_add_pd(_y[ib+40+7], _mm512_mul_pd(_a,_x[ib+40+7]));
                        Z[ib+48+0] = _mm512_add_pd(_y[ib+48+0], _mm512_mul_pd(_a,_x[ib+48+0]));
                        Z[ib+48+1] = _mm512_add_pd(_y[ib+48+1], _mm512_mul_pd(_a,_x[ib+48+1]));
                        Z[ib+48+2] = _mm512_add_pd(_y[ib+48+2], _mm512_mul_pd(_a,_x[ib+48+2]));
                        Z[ib+48+3] = _mm512_add_pd(_y[ib+48+3], _mm512_mul_pd(_a,_x[ib+48+3]));
                        Z[ib+48+4] = _mm512_add_pd(_y[ib+48+4], _mm512_mul_pd(_a,_x[ib+48+4]));
                        Z[ib+48+5] = _mm512_add_pd(_y[ib+48+5], _mm512_mul_pd(_a,_x[ib+48+5]));
                        Z[ib+48+6] = _mm512_add_pd(_y[ib+48+6], _mm512_mul_pd(_a,_x[ib+48+6]));
                        Z[ib+48+7] = _mm512_add_pd(_y[ib+48+7], _mm512_mul_pd(_a,_x[ib+48+7]));
                        Z[ib+56+0] = _mm512_add_pd(_y[ib+56+0], _mm512_mul_pd(_a,_x[ib+56+0]));
                        Z[ib+56+1] = _mm512_add_pd(_y[ib+56+1], _mm512_mul_pd(_a,_x[ib+56+1]));
                        Z[ib+56+2] = _mm512_add_pd(_y[ib+56+2], _mm512_mul_pd(_a,_x[ib+56+2]));
                        Z[ib+56+3] = _mm512_add_pd(_y[ib+56+3], _mm512_mul_pd(_a,_x[ib+56+3]));
                        Z[ib+56+4] = _mm512_add_pd(_y[ib+56+4], _mm512_mul_pd(_a,_x[ib+56+4]));
                        Z[ib+56+5] = _mm512_add_pd(_y[ib+56+5], _mm512_mul_pd(_a,_x[ib+56+5]));
                        Z[ib+56+6] = _mm512_add_pd(_y[ib+56+6], _mm512_mul_pd(_a,_x[ib+56+6]));
                        Z[ib+56+7] = _mm512_add_pd(_y[ib+56+7], _mm512_mul_pd(_a,_x[ib+56+7]));
                    }
#endif
                    asm("#t0-end");  /* marker for reading the generated asm */
                }
            }
            if(p==(samples-1)) t_end = timer();
        }
        /* average time per repetition of the last sample */
        t_min = (t_end - t_start)/reps;
        printf("%lf,%lf,%lf,%lf\n", ret_t0, ret_t1, ret_t2, ret_t3);
        /* bandwidth: 3 streams (x read, y read, z write) per thread */
        SAVE_DATA("%lf\t", 3*nthreads*bytes/t_min);
        /* NOTE(review): "%d" is paired with 3*elems*sizeof(DT)*nthreads,
           a size_t-valued expression — format/argument mismatch (UB);
           should be "%zu" with a (size_t) cast.  Also "elems=" actually
           prints elems/8 (vector count), not elems. */
        printf("cbw: %lf\t elems= %d mem_tot= %d\n", 3*nthreads*bytes/t_min, elems/8, 3*elems*sizeof(DT)*nthreads);
        /* NOTE(review): the NULL guards are redundant (_mm_free(NULL)
           aside, these pointers are never NULL-checked after malloc) */
        if(memA_t0!=NULL) _mm_free(memA_t0);
        if(memA_t1!=NULL) _mm_free(memA_t1);
        if(memA_t2!=NULL) _mm_free(memA_t2);
        if(memA_t3!=NULL) _mm_free(memA_t3);
        if(memB_t0!=NULL) _mm_free(memB_t0);
        if(memB_t1!=NULL) _mm_free(memB_t1);
        if(memB_t2!=NULL) _mm_free(memB_t2);
        if(memB_t3!=NULL) _mm_free(memB_t3);
        if(memO_t0!=NULL) _mm_free(memO_t0);
        if(memO_t1!=NULL) _mm_free(memO_t1);
        if(memO_t2!=NULL) _mm_free(memO_t2);
        if(memO_t3!=NULL) _mm_free(memO_t3);
    }
    DB(DB_LVL, SEPERATOR);
    /* post-process */
    DB(DB_LVL, "post-process");
    DB(DB_LVL, SEPERATOR);
}
/*
 * 16x14 rank-k update micro-kernel (Xeon Phi / KNC flavor).
 *
 * Accumulates, over k iterations, the outer product of a 16-element
 * column of A (split into lanes 0..7 and 8..15) with 14 broadcast
 * elements of B, then collapses all 14 column accumulators into one
 * 16-element sum and stores it to c[0..15] ("simulated kernel
 * summation" — the per-column stores are intentionally disabled).
 *
 * a    : packed A panel, advanced 16 doubles per iteration
 * b    : packed B panel, advanced 16 doubles per iteration (padded from
 *        14 to 16 entries per column)
 * c    : output, 16 doubles, 64-byte aligned (aligned stores below)
 * ldc  : unused in this simulation variant
 * aux  : unused in this simulation variant
 *
 * NOTE(review): _mm512_extload_pd is a KNC (IMCI) broadcast-load
 * intrinsic, so this kernel targets Xeon Phi, not AVX-512F.
 *
 * Fix vs. original: the 28 accumulators were read by _mm512_fmadd_pd
 * before ever being written (uninitialized read — undefined behavior,
 * flagged by the old "TODO: need to clean the c buffer").  They are now
 * explicitly zeroed before the k-loop.  The unused 'neg2' local and the
 * dead commented-out code were removed; the hand-unrolled register
 * layout is kept deliberately (this is a register-blocked micro-kernel).
 */
void ks_rank_k_int_d16x14( int k, double *a, double *b, double *c, int ldc, aux_t *aux )
{
  int i;
  v8df_t c007_0, c007_1, c007_2, c007_3, c007_4;
  v8df_t c007_5, c007_6, c007_7, c007_8, c007_9;
  v8df_t c007_10, c007_11, c007_12, c007_13;
  v8df_t c815_0, c815_1, c815_2, c815_3, c815_4;
  v8df_t c815_5, c815_6, c815_7, c815_8, c815_9;
  v8df_t c815_10, c815_11, c815_12, c815_13;
  v8df_t a007, a815, b_tmp;
  int k_iter = k;

  (void)ldc;  /* unused: only the summed panel is stored */
  (void)aux;  /* unused in this simulation variant */

  /* Zero all accumulators (the fmadd below reads them on iteration 0). */
  c007_0.v  = _mm512_setzero_pd(); c815_0.v  = _mm512_setzero_pd();
  c007_1.v  = _mm512_setzero_pd(); c815_1.v  = _mm512_setzero_pd();
  c007_2.v  = _mm512_setzero_pd(); c815_2.v  = _mm512_setzero_pd();
  c007_3.v  = _mm512_setzero_pd(); c815_3.v  = _mm512_setzero_pd();
  c007_4.v  = _mm512_setzero_pd(); c815_4.v  = _mm512_setzero_pd();
  c007_5.v  = _mm512_setzero_pd(); c815_5.v  = _mm512_setzero_pd();
  c007_6.v  = _mm512_setzero_pd(); c815_6.v  = _mm512_setzero_pd();
  c007_7.v  = _mm512_setzero_pd(); c815_7.v  = _mm512_setzero_pd();
  c007_8.v  = _mm512_setzero_pd(); c815_8.v  = _mm512_setzero_pd();
  c007_9.v  = _mm512_setzero_pd(); c815_9.v  = _mm512_setzero_pd();
  c007_10.v = _mm512_setzero_pd(); c815_10.v = _mm512_setzero_pd();
  c007_11.v = _mm512_setzero_pd(); c815_11.v = _mm512_setzero_pd();
  c007_12.v = _mm512_setzero_pd(); c815_12.v = _mm512_setzero_pd();
  c007_13.v = _mm512_setzero_pd(); c815_13.v = _mm512_setzero_pd();

  for ( i = 0; i < k_iter; ++ i ) {
    /* load one 16-element column of A */
    a007.v = _mm512_load_pd( a );
    a815.v = _mm512_load_pd( a + 8 );

    /* broadcast each of the 14 B entries and accumulate */
    b_tmp.v = _mm512_extload_pd( b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_0.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_0.v );
    c815_0.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_0.v );

    b_tmp.v = _mm512_extload_pd( b + 1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_1.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_1.v );
    c815_1.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_1.v );

    b_tmp.v = _mm512_extload_pd( b + 2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_2.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_2.v );
    c815_2.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_2.v );

    b_tmp.v = _mm512_extload_pd( b + 3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_3.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_3.v );
    c815_3.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_3.v );

    b_tmp.v = _mm512_extload_pd( b + 4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_4.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_4.v );
    c815_4.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_4.v );

    b_tmp.v = _mm512_extload_pd( b + 5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_5.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_5.v );
    c815_5.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_5.v );

    b_tmp.v = _mm512_extload_pd( b + 6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_6.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_6.v );
    c815_6.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_6.v );

    b_tmp.v = _mm512_extload_pd( b + 7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_7.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_7.v );
    c815_7.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_7.v );

    b_tmp.v = _mm512_extload_pd( b + 8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_8.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_8.v );
    c815_8.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_8.v );

    b_tmp.v = _mm512_extload_pd( b + 9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_9.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_9.v );
    c815_9.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_9.v );

    b_tmp.v = _mm512_extload_pd( b + 10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_10.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_10.v );
    c815_10.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_10.v );

    b_tmp.v = _mm512_extload_pd( b + 11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_11.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_11.v );
    c815_11.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_11.v );

    b_tmp.v = _mm512_extload_pd( b + 12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_12.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_12.v );
    c815_12.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_12.v );

    b_tmp.v = _mm512_extload_pd( b + 13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_13.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_13.v );
    c815_13.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_13.v );

    /* advance the packed panels (B column is padded to 16 doubles) */
    a += 16;
    b += 16;
  }

  /* simulate kernel summation: fold columns 1..13 into column 0 */
  c007_0.v = _mm512_add_pd( c007_0.v, c007_1.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_1.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_2.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_2.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_3.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_3.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_4.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_4.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_5.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_5.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_6.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_6.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_7.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_7.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_8.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_8.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_9.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_9.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_10.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_10.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_11.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_11.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_12.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_12.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_13.v );
  c815_0.v = _mm512_add_pd( c815_0.v, c815_13.v );

  /* store the single summed 16-element panel */
  _mm512_store_pd( c    , c007_0.v );
  _mm512_store_pd( c + 8, c815_0.v );
}
/*
 * Computes io_c[n] = i_h1 * (i_g1[n]*i_tm1[n] + i_g2[n]*i_tm2[n]
 *                            + i_g3[n]*i_tm3[n]) for n in [0, i_length).
 *
 * The range is split by stream_init() (defined elsewhere) into a scalar
 * prologue (up to the alignment boundary of io_c), a vectorized bulk
 * that uses streaming (non-temporal) stores unless
 * DISABLE_NONTEMPORAL_STORES is defined, and a scalar epilogue for the
 * remainder.  Inputs are loaded unaligned; only io_c is assumed aligned
 * in the bulk (stream/store require it, which the prologue establishes
 * — presumably; confirm against stream_init's contract).
 */
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_update_helmholtz_no_h2(
    const double* i_g1, const double* i_g2, const double* i_g3,
    const double* i_tm1, const double* i_tm2, const double* i_tm3,
    double* io_c, const double i_h1, const int i_length)
{
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue (scalar, until io_c is vector-aligned) */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates too
       many dependencies; 8 doubles per iteration as two 4-wide halves,
       with loads interleaved by hand — keep this schedule as-is */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;
      vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));
      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));
      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2);
      vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));
      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    /* one 8-wide ZMM iteration covers the same 8 doubles per step */
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1 = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2 = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3 = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  /* scalar fallback when no suitable SIMD ISA is enabled */
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue (scalar remainder) */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
// Lane-wise sum of two 8-lane double vectors; returns a fresh vector
// and leaves both operands untouched.
inline short_vec<double, 8> operator+(const short_vec<double, 8>& other) const
{
    const __m512d summed = _mm512_add_pd(val, other.val);
    return short_vec<double, 8>(summed);
}
// Accumulate the other vector into this one, lane by lane.
inline void operator+=(const short_vec<double, 8>& other)
{
    const __m512d addend = other.val;
    val = _mm512_add_pd(val, addend);
}