void SpMM(Csr<ValueType>* m1, Csr<ValueType>* m2, int num_buckets) {
  vector<FastHash<int, ValueType>* > result_map(m1->num_rows);
  for (auto& v : result_map) {
    v = new FastHash<int, ValueType>(num_buckets);
  }

  cout << "Starting SpMM..." << endl;

  float res = 0;
  double before = rtclock();

  for (int i = 0; i < m1->num_rows; i++) {
    for (int j = m1->rows[i]; j < m1->rows[i+1]; j++) {
      int cola = m1->cols[j];
      __m512d a = _mm512_set1_pd(m1->vals[j]);

      /* rows of m2 are processed in chunks of 16; slots with col == -1 are padding and are skipped */
      for (int k = m2->rows[cola]; k < m2->rows[cola] + m2->row_lens[cola]; k += 16) {
        __m512d *pb1   = (__m512d *)(&(m2->vals[k]));
        __m512d *pb2   = (__m512d *)(&(m2->vals[k]) + 8);
        __m512i *pcols = (__m512i *)(&(m2->cols[k]));

        __m512d c1 = _mm512_mul_pd(a, *pb1);
        __m512d c2 = _mm512_mul_pd(a, *pb2);

        for (int x = 0; x < 8; x++) {
          int col = ((int *)pcols)[x];
          if (col == -1) {
            continue;
          }
          ValueType val = ((ValueType *)(&c1))[x];
          result_map[i]->Reduce(col, val);
          res += val;
        }
        for (int x = 0; x < 8; ++x) {
          int col = ((int *)pcols)[x+8];
          if (col == -1) {
            continue;
          }
          ValueType val = ((ValueType *)(&c2))[x];
          result_map[i]->Reduce(col, val);
          res += val;
        }
      }
    }
  }

  double after = rtclock();

  cout << "res: " << res << endl;
  cout << RED << "[****Result****] ========> *SIMD Naive* time: "
       << after - before << " secs." << RESET << endl;

  for (auto& v : result_map) {
    delete v;
  }
}
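// A scalar reference for the SIMD kernel above (a validation sketch only, not
// part of the original code; it assumes the same Csr<ValueType>/FastHash
// interfaces used there). Padded column slots marked with -1 are skipped
// exactly as in the vectorized inner loop.
void SpMM_scalar_ref(Csr<ValueType>* m1, Csr<ValueType>* m2, int num_buckets) {
  vector<FastHash<int, ValueType>* > result_map(m1->num_rows);
  for (auto& v : result_map) {
    v = new FastHash<int, ValueType>(num_buckets);
  }
  for (int i = 0; i < m1->num_rows; i++) {
    for (int j = m1->rows[i]; j < m1->rows[i+1]; j++) {
      int cola = m1->cols[j];
      ValueType a = m1->vals[j];
      for (int k = m2->rows[cola]; k < m2->rows[cola] + m2->row_lens[cola]; k++) {
        if (m2->cols[k] == -1) continue;                     // padding slot
        result_map[i]->Reduce(m2->cols[k], a * m2->vals[k]); // accumulate partial product
      }
    }
  }
  for (auto& v : result_map) {
    delete v;
  }
}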
double vNormalIntegral(double b) {
  __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp;

  //NN/2-1 has to be the multiple of 8
  //NN = (8*LV+1)*2, LV = 20 -> NN = 322
  //const int NN = 322;
  const int vecsize = 8;
  const int nCal = (NN/2-1)/vecsize;
  //const int left = NN%vecsize;

  double a = 0.0f;
  double s, h, sum = 0.0f;
  h = (b-a)/NN;

  // add in the first few terms
  sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0);
  // and the last one
  sum += exp(-b*b/2.0);

  vec_cf0 = _mm512_set1_pd(a);
  vec_cf1 = _mm512_set1_pd(2*h);
  vec_cf2 = _mm512_set1_pd(-0.5);
  vec_s   = _mm512_set_pd(8,7,6,5,4,3,2,1);   //vectorize
  vec_s   = _mm512_mul_pd(vec_s, vec_cf1);    //(16h,14h,..,2h)
  vec_s   = _mm512_add_pd(vec_cf0, vec_s);    //(a+16h,..,a+2h)
  vec_stp = _mm512_set1_pd(2*h*vecsize-h);
  vec_cf0 = _mm512_set1_pd(h);

  for (int i = 0; i < nCal; ++i) {
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);           //vec_cf1->sum
    sum += 2.0*_mm512_reduce_add_pd(vec_cf1);

    vec_s = _mm512_add_pd(vec_s, vec_cf0);      //s+=h
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);
    sum += 4.0*_mm512_reduce_add_pd(vec_cf1);

    vec_s = _mm512_add_pd(vec_s, vec_stp);
  }

  sum = 0.5*sqrt(2*PI) + h*sum/3.0;
  return sum;
}
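/*
 * Sanity check (an editorial sketch, not part of the original; assumes <math.h>
 * and <stdio.h>): the routine above adds 0.5*sqrt(2*PI), i.e. the integral of
 * exp(-t*t/2) over (-inf, 0], to a Simpson approximation of the same integrand
 * over [0, b], so it approximates sqrt(2*PI) * Phi(b) where Phi is the standard
 * normal CDF. Dividing by sqrt(2*PI) should therefore agree with the erf-based
 * closed form for b >= 0.
 */
int check_vNormalIntegral(double b) {
  double approx = vNormalIntegral(b) / sqrt(2.0 * PI);
  double exact  = 0.5 * (1.0 + erf(b / sqrt(2.0)));
  printf("b=%g approx=%.12f exact=%.12f diff=%.3e\n",
         b, approx, exact, approx - exact);
  return fabs(approx - exact) < 1e-6;
}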
int main(int argc, char** argv) {
  /* variable declaration */
  DB(DB_LVL, "declaration");
  DT *memA_t0, *memA_t1, *memA_t2, *memA_t3;
  DT *memB_t0, *memB_t1, *memB_t2, *memB_t3;
  DT *memO_t0, *memO_t1, *memO_t2, *memO_t3;
  int reps, size;
  int samples;
  int tid;
  int i, p, r, bytes, elems;
  int bytes_min, bytes_max;
  int elems_min, elems_max;
  double func_overhead;
  double t_start, t_end;
  double t_min, c_min;
  double alpha = 0.5;
  DB(DB_LVL, SEPERATOR);

  /* initialization */
  DB(DB_LVL, "initialization");
  samples = 3;
  bytes_min = 1024, bytes_max = 1024*32;                              /* [1KB, 32KB] */
  elems_min = bytes_min/sizeof(DT), elems_max = bytes_max/sizeof(DT); /* the number of elements */
  reps = 40000;
  DB(DB_LVL, SEPERATOR);

  /* omp environment */
  const int nthreads = argc > 1 ? atoi(argv[1]) : 4;
  fprintf(stderr, "nthreads= %d\n", nthreads);
  omp_set_num_threads(nthreads);

  /* iteration */
  DB(DB_LVL, "measurement");
  for(elems=elems_min, bytes=bytes_min; elems<=elems_max; elems=elems+elems_min, bytes=bytes+bytes_min) {
    memA_t0 = (DT *)_mm_malloc(bytes_max, 64);
    memA_t1 = (DT *)_mm_malloc(bytes_max, 64);
    memA_t2 = (DT *)_mm_malloc(bytes_max, 64);
    memA_t3 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t0 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t1 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t2 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t3 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t0 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t1 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t2 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t3 = (DT *)_mm_malloc(bytes_max, 64);

    /* initialize the per-thread buffers */
    fill(memA_t0, elems, 1.0); fill(memA_t1, elems, 2.0); fill(memA_t2, elems, 3.0); fill(memA_t3, elems, 4.0);
    fill(memB_t0, elems, 1.0); fill(memB_t1, elems, 2.0); fill(memB_t2, elems, 3.0); fill(memB_t3, elems, 4.0);
    fill(memO_t0, elems, 1.0); fill(memO_t1, elems, 2.0); fill(memO_t2, elems, 3.0); fill(memO_t3, elems, 4.0);

    /* measurement */
    t_min = 0.0f;
    c_min = 0.0f;
    DT ret_t0 = 0.0;
    DT ret_t1 = 0.0;
    DT ret_t2 = 0.0;
    DT ret_t3 = 0.0;

#ifdef SAXPY2
#define Z _z
#else
#define Z _z  /* both branches currently map Z to the per-thread output stream _z */
#endif

    for(p=0; p<samples; p++) {
      __m512d *_x, *_y, *_z;
#pragma omp parallel private(_x,_y,_z) default(shared)
      {
        int tid;
        tid = omp_get_thread_num();
        switch(tid) {
          case 0: _x = (__m512d*)memA_t0; _y = (__m512d*)memB_t0; _z = (__m512d*)memO_t0; break;
          case 1: _x = (__m512d*)memA_t1; _y = (__m512d*)memB_t1; _z = (__m512d*)memO_t1; break;
          case 2: _x = (__m512d*)memA_t2; _y = (__m512d*)memB_t2; _z = (__m512d*)memO_t2; break;
          case 3: _x = (__m512d*)memA_t3; _y = (__m512d*)memB_t3; _z = (__m512d*)memO_t3; break;
          default: assert(0);
        }
#pragma omp barrier
        if(p==(samples-1)) t_start = timer();
        int r;
        for(r=0; r<reps; r++) {
          asm("#t0-beg");
#if 0
          double *memO_t0 = (double*)Z;
          const double *memA_t0 = (double*)_x;
          const double *memB_t0 = (double*)_y;
#pragma vector aligned
          for(i=0; i<elems; i=i+1) {
            //ret_t0 += mem_t0[i];
            memO_t0[i] = alpha * memA_t0[i] + memB_t0[i];
          }
          memO_t0[0] = memO_t0[0] * 0.1; // to avoid overflow and optimizations
#else
          const int cnts = elems >> 3;   /* number of 8-double vectors */
          const __m512d _a = _mm512_set1_pd(alpha);
          int ib;
          /* manually unrolled: 64 vector SAXPY updates per iteration */
          for (ib = 0; ib < cnts; ib += 8*8) {
            Z[ib+0] = _mm512_add_pd(_y[ib+0], _mm512_mul_pd(_a,_x[ib+0]));
            Z[ib+1] = _mm512_add_pd(_y[ib+1], _mm512_mul_pd(_a,_x[ib+1]));
            Z[ib+2] = _mm512_add_pd(_y[ib+2], _mm512_mul_pd(_a,_x[ib+2]));
            Z[ib+3] = _mm512_add_pd(_y[ib+3], _mm512_mul_pd(_a,_x[ib+3]));
            Z[ib+4] = _mm512_add_pd(_y[ib+4], _mm512_mul_pd(_a,_x[ib+4]));
            Z[ib+5] = _mm512_add_pd(_y[ib+5], _mm512_mul_pd(_a,_x[ib+5]));
            Z[ib+6] = _mm512_add_pd(_y[ib+6], _mm512_mul_pd(_a,_x[ib+6]));
            Z[ib+7] = _mm512_add_pd(_y[ib+7], _mm512_mul_pd(_a,_x[ib+7]));
            Z[ib+8+0] = _mm512_add_pd(_y[ib+8+0], _mm512_mul_pd(_a,_x[ib+8+0]));
            Z[ib+8+1] = _mm512_add_pd(_y[ib+8+1], _mm512_mul_pd(_a,_x[ib+8+1]));
            Z[ib+8+2] = _mm512_add_pd(_y[ib+8+2], _mm512_mul_pd(_a,_x[ib+8+2]));
            Z[ib+8+3] = _mm512_add_pd(_y[ib+8+3], _mm512_mul_pd(_a,_x[ib+8+3]));
            Z[ib+8+4] = _mm512_add_pd(_y[ib+8+4], _mm512_mul_pd(_a,_x[ib+8+4]));
            Z[ib+8+5] = _mm512_add_pd(_y[ib+8+5], _mm512_mul_pd(_a,_x[ib+8+5]));
            Z[ib+8+6] = _mm512_add_pd(_y[ib+8+6], _mm512_mul_pd(_a,_x[ib+8+6]));
            Z[ib+8+7] = _mm512_add_pd(_y[ib+8+7], _mm512_mul_pd(_a,_x[ib+8+7]));
            Z[ib+16+0] = _mm512_add_pd(_y[ib+16+0], _mm512_mul_pd(_a,_x[ib+16+0]));
            Z[ib+16+1] = _mm512_add_pd(_y[ib+16+1], _mm512_mul_pd(_a,_x[ib+16+1]));
            Z[ib+16+2] = _mm512_add_pd(_y[ib+16+2], _mm512_mul_pd(_a,_x[ib+16+2]));
            Z[ib+16+3] = _mm512_add_pd(_y[ib+16+3], _mm512_mul_pd(_a,_x[ib+16+3]));
            Z[ib+16+4] = _mm512_add_pd(_y[ib+16+4], _mm512_mul_pd(_a,_x[ib+16+4]));
            Z[ib+16+5] = _mm512_add_pd(_y[ib+16+5], _mm512_mul_pd(_a,_x[ib+16+5]));
            Z[ib+16+6] = _mm512_add_pd(_y[ib+16+6], _mm512_mul_pd(_a,_x[ib+16+6]));
            Z[ib+16+7] = _mm512_add_pd(_y[ib+16+7], _mm512_mul_pd(_a,_x[ib+16+7]));
            Z[ib+24+0] = _mm512_add_pd(_y[ib+24+0], _mm512_mul_pd(_a,_x[ib+24+0]));
            Z[ib+24+1] = _mm512_add_pd(_y[ib+24+1], _mm512_mul_pd(_a,_x[ib+24+1]));
            Z[ib+24+2] = _mm512_add_pd(_y[ib+24+2], _mm512_mul_pd(_a,_x[ib+24+2]));
            Z[ib+24+3] = _mm512_add_pd(_y[ib+24+3], _mm512_mul_pd(_a,_x[ib+24+3]));
            Z[ib+24+4] = _mm512_add_pd(_y[ib+24+4], _mm512_mul_pd(_a,_x[ib+24+4]));
            Z[ib+24+5] = _mm512_add_pd(_y[ib+24+5], _mm512_mul_pd(_a,_x[ib+24+5]));
            Z[ib+24+6] = _mm512_add_pd(_y[ib+24+6], _mm512_mul_pd(_a,_x[ib+24+6]));
            Z[ib+24+7] = _mm512_add_pd(_y[ib+24+7], _mm512_mul_pd(_a,_x[ib+24+7]));
            Z[ib+32+0] = _mm512_add_pd(_y[ib+32+0], _mm512_mul_pd(_a,_x[ib+32+0]));
            Z[ib+32+1] = _mm512_add_pd(_y[ib+32+1], _mm512_mul_pd(_a,_x[ib+32+1]));
            Z[ib+32+2] = _mm512_add_pd(_y[ib+32+2], _mm512_mul_pd(_a,_x[ib+32+2]));
            Z[ib+32+3] = _mm512_add_pd(_y[ib+32+3], _mm512_mul_pd(_a,_x[ib+32+3]));
            Z[ib+32+4] = _mm512_add_pd(_y[ib+32+4], _mm512_mul_pd(_a,_x[ib+32+4]));
            Z[ib+32+5] = _mm512_add_pd(_y[ib+32+5], _mm512_mul_pd(_a,_x[ib+32+5]));
            Z[ib+32+6] = _mm512_add_pd(_y[ib+32+6], _mm512_mul_pd(_a,_x[ib+32+6]));
            Z[ib+32+7] = _mm512_add_pd(_y[ib+32+7], _mm512_mul_pd(_a,_x[ib+32+7]));
            Z[ib+40+0] = _mm512_add_pd(_y[ib+40+0], _mm512_mul_pd(_a,_x[ib+40+0]));
            Z[ib+40+1] = _mm512_add_pd(_y[ib+40+1], _mm512_mul_pd(_a,_x[ib+40+1]));
            Z[ib+40+2] = _mm512_add_pd(_y[ib+40+2], _mm512_mul_pd(_a,_x[ib+40+2]));
            Z[ib+40+3] = _mm512_add_pd(_y[ib+40+3], _mm512_mul_pd(_a,_x[ib+40+3]));
            Z[ib+40+4] = _mm512_add_pd(_y[ib+40+4], _mm512_mul_pd(_a,_x[ib+40+4]));
            Z[ib+40+5] = _mm512_add_pd(_y[ib+40+5], _mm512_mul_pd(_a,_x[ib+40+5]));
            Z[ib+40+6] = _mm512_add_pd(_y[ib+40+6], _mm512_mul_pd(_a,_x[ib+40+6]));
            Z[ib+40+7] = _mm512_add_pd(_y[ib+40+7], _mm512_mul_pd(_a,_x[ib+40+7]));
            Z[ib+48+0] = _mm512_add_pd(_y[ib+48+0], _mm512_mul_pd(_a,_x[ib+48+0]));
            Z[ib+48+1] = _mm512_add_pd(_y[ib+48+1], _mm512_mul_pd(_a,_x[ib+48+1]));
            Z[ib+48+2] = _mm512_add_pd(_y[ib+48+2], _mm512_mul_pd(_a,_x[ib+48+2]));
            Z[ib+48+3] = _mm512_add_pd(_y[ib+48+3], _mm512_mul_pd(_a,_x[ib+48+3]));
            Z[ib+48+4] = _mm512_add_pd(_y[ib+48+4], _mm512_mul_pd(_a,_x[ib+48+4]));
            Z[ib+48+5] = _mm512_add_pd(_y[ib+48+5], _mm512_mul_pd(_a,_x[ib+48+5]));
            Z[ib+48+6] = _mm512_add_pd(_y[ib+48+6], _mm512_mul_pd(_a,_x[ib+48+6]));
            Z[ib+48+7] = _mm512_add_pd(_y[ib+48+7], _mm512_mul_pd(_a,_x[ib+48+7]));
            Z[ib+56+0] = _mm512_add_pd(_y[ib+56+0], _mm512_mul_pd(_a,_x[ib+56+0]));
            Z[ib+56+1] = _mm512_add_pd(_y[ib+56+1], _mm512_mul_pd(_a,_x[ib+56+1]));
            Z[ib+56+2] = _mm512_add_pd(_y[ib+56+2], _mm512_mul_pd(_a,_x[ib+56+2]));
            Z[ib+56+3] = _mm512_add_pd(_y[ib+56+3], _mm512_mul_pd(_a,_x[ib+56+3]));
            Z[ib+56+4] = _mm512_add_pd(_y[ib+56+4], _mm512_mul_pd(_a,_x[ib+56+4]));
            Z[ib+56+5] = _mm512_add_pd(_y[ib+56+5], _mm512_mul_pd(_a,_x[ib+56+5]));
            Z[ib+56+6] = _mm512_add_pd(_y[ib+56+6], _mm512_mul_pd(_a,_x[ib+56+6]));
            Z[ib+56+7] = _mm512_add_pd(_y[ib+56+7], _mm512_mul_pd(_a,_x[ib+56+7]));
          }
#endif
          asm("#t0-end");
        }
      }
      if(p==(samples-1)) t_end = timer();
    }

    t_min = (t_end - t_start)/reps;
    printf("%lf,%lf,%lf,%lf\n", ret_t0, ret_t1, ret_t2, ret_t3);
    SAVE_DATA("%lf\t", 3*nthreads*bytes/t_min);
    printf("cbw: %lf\t elems= %d mem_tot= %zu\n", 3*nthreads*bytes/t_min, elems/8, 3*elems*sizeof(DT)*nthreads);

    if(memA_t0!=NULL) _mm_free(memA_t0);
    if(memA_t1!=NULL) _mm_free(memA_t1);
    if(memA_t2!=NULL) _mm_free(memA_t2);
    if(memA_t3!=NULL) _mm_free(memA_t3);
    if(memB_t0!=NULL) _mm_free(memB_t0);
    if(memB_t1!=NULL) _mm_free(memB_t1);
    if(memB_t2!=NULL) _mm_free(memB_t2);
    if(memB_t3!=NULL) _mm_free(memB_t3);
    if(memO_t0!=NULL) _mm_free(memO_t0);
    if(memO_t1!=NULL) _mm_free(memO_t1);
    if(memO_t2!=NULL) _mm_free(memO_t2);
    if(memO_t3!=NULL) _mm_free(memO_t3);
  }
  DB(DB_LVL, SEPERATOR);

  /* post-process */
  DB(DB_LVL, "post-process");
  DB(DB_LVL, SEPERATOR);
}
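/*
 * Reading of the metric printed above (an editorial note, not part of the
 * original benchmark): each SAXPY update z[i] = alpha*x[i] + y[i] touches
 * three streams per element (read x, read y, write z), so the reported
 * "cbw" value 3*nthreads*bytes/t_min is the aggregate number of bytes moved
 * per second across all threads for one repetition. A hypothetical helper
 * spelling that out:
 */
static inline double saxpy_bandwidth_bytes_per_sec(int nthreads,
                                                   int bytes_per_array,
                                                   double secs_per_rep) {
  return 3.0 * nthreads * (double)bytes_per_array / secs_per_rep;
}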
void newviewGTRGAMMA_MIC(int tipCase, double *x1, double *x2, double *x3, double *extEV, double *tipVector,
                         int *ex3, unsigned char *tipX1, unsigned char *tipX2, int n, double *left, double *right,
                         int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
{
  /* states, span and maxStateValue are assumed to be defined in the surrounding file
     (for the DNA/GAMMA case: 4 states, a per-site span of 16 doubles). */
  __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
  __m512d twotothe256_MIC   = _mm512_set1_pd(PLL_TWOTOTHE256);
  __m512i absMask_MIC       = _mm512_set1_epi64(0x7fffffffffffffffULL);

  int addScale = 0;

  double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));

#pragma ivdep
  for (int l = 0; l < 64; ++l) {
    aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
  }

  switch (tipCase) {
    case PLL_TIP_TIP:
    {
      /* multiply all possible tip state vectors with the respective P-matrices */
      double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

      for(int k = 0; k < 256; ++k) {
        umpX1[k] = 0.0;
        umpX2[k] = 0.0;
      }

      for(int i = 0; i < maxStateValue; ++i) {
        for(int l = 0; l < states; ++l) {
#pragma ivdep
          for(int k = 0; k < span; ++k) {
            umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
            umpX2[16 * i + k] += tipVector[i * 4 + l] * right[k * 4 + l];
          }
        }
      }

      double auX[64] __attribute__((align(64)));

      for(int i = 0; i < n; ++i) {
        _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
        _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

        const double *uX1 = &umpX1[16 * tipX1[i]];
        const double *uX2 = &umpX2[16 * tipX2[i]];

        double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double* v = &x3[i * 16];

#pragma ivdep
#pragma vector aligned
        for(int l = 0; l < 16; ++l) {
          uX[l] = uX1[l] * uX2[l];
          v[l] = 0.;
        }

        mic_broadcast16x64(uX, auX);

        for (int j = 0; j < 4; ++j) {
#pragma ivdep
#pragma vector aligned
#pragma vector nontemporal
          for(int k = 0; k < 16; ++k) {
            v[k] += auX[j*16 + k] * aEV[j*16 + k];
          }
        }

        // init scaling counter for the site
        if (!fastScaling)
          ex3[i] = 0;
      } // sites loop
    }
    break;
    case PLL_TIP_INNER:
    {
      /* we do analogous pre-computations as above, with the only difference
         that we now do them only for one tip vector */
      double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

      /* precompute P and left tip vector product */
      for(int k = 0; k < 256; ++k) {
        umpX1[k] = 0.0;
      }

      for(int i = 0; i < 16; ++i) {
        for(int l = 0; l < 4; ++l) {
#pragma ivdep
          for(int k = 0; k < 16; ++k) {
            umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
          }
        }
      }

      // re-arrange right matrix for better memory layout
      double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      for(int j = 0; j < 4; j++) {
        for(int l = 0; l < 16; l++) {
          aRight[j*16 + l] = right[l*4 + j];
        }
      }

      for (int i = 0; i < n; i++) {
        _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
        _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
        _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);

        _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
        _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
        _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
        _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

        /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
        double* uX1 = &umpX1[span * tipX1[i]];

        double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));

#pragma vector aligned
        for(int l = 0; l < 16; ++l) {
          uX2[l] = 0.;
        }

        double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        const double* v2 = &(x2[16 * i]);

        mic_broadcast16x64(v2, aV2);

        for(int j = 0; j < 4; j++) {
#pragma ivdep
#pragma vector aligned
          for(int l = 0; l < 16; l++) {
            uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
          }
        }

        double* v3 = &(x3[span * i]);

#pragma ivdep
#pragma vector aligned
        for(int l = 0; l < 16; ++l) {
          uX[l] = uX1[l] * uX2[l];
          v3[l] = 0.;
        }

        double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        mic_broadcast16x64(uX, auX);

        for (int j = 0; j < 4; ++j) {
#pragma ivdep
#pragma vector aligned
          for(int k = 0; k < 16; ++k) {
            v3[k] += auX[j*16 + k] * aEV[j*16 + k];
          }
        }

        __m512d t1 = _mm512_load_pd(&v3[0]);
        t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
        double vmax1 = _mm512_reduce_gmax_pd(t1);
        __m512d t2 = _mm512_load_pd(&v3[8]);
        t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
        double vmax2 = _mm512_reduce_gmax_pd(t2);

        if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD) {
          t1 = _mm512_mul_pd(t1, twotothe256_MIC);
          _mm512_store_pd(&v3[0], t1);
          t2 = _mm512_mul_pd(t2, twotothe256_MIC);
          _mm512_store_pd(&v3[8], t2);

          if(!fastScaling)
            ex3[i] += 1;
          else
            addScale += wgt[i];
        }
      } // site loop
    }
    break;
    case PLL_INNER_INNER:
    {
      /* same as above, without pre-computations */

      // re-arrange left and right matrices for better memory layout
      double aLeft[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      for(int j = 0; j < 4; j++) {
        for(int l = 0; l < 16; l++) {
          aLeft[j*16 + l] = left[l*4 + j];
          aRight[j*16 + l] = right[l*4 + j];
        }
      }

      for (int i = 0; i < n; i++) {
        _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
        _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
        _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
        _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
        _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);

        _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
        _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
        _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
        _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
        _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
        _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

        double uX1[16] __attribute__((align(64)));
        double uX2[16] __attribute__((align(64)));
        double uX[16]  __attribute__((align(64)));

        for(int l = 0; l < 16; l++) {
          uX1[l] = 0.;
          uX2[l] = 0.;
        }

        double aV1[64] __attribute__((align(64)));
        double aV2[64] __attribute__((align(64)));

        const double* v1 = &(x1[span * i]);
        const double* v2 = &(x2[span * i]);

        mic_broadcast16x64(v1, aV1);
        mic_broadcast16x64(v2, aV2);

        for(int j = 0; j < 4; j++) {
#pragma ivdep
#pragma vector aligned
          for(int l = 0; l < 16; l++) {
            uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
            uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
          }
        }

        double* v3 = &(x3[span * i]);

#pragma ivdep
#pragma vector aligned
        for(int l = 0; l < 16; ++l) {
          uX[l] = uX1[l] * uX2[l];
          v3[l] = 0.;
        }

        double auX[64] __attribute__((align(64)));
        mic_broadcast16x64(uX, auX);

        for(int j = 0; j < 4; ++j) {
#pragma ivdep
#pragma vector aligned
          for(int k = 0; k < 16; ++k) {
            v3[k] += auX[j*16 + k] * aEV[j*16 + k];
          }
        }

        __m512d t1 = _mm512_load_pd(&v3[0]);
        t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
        double vmax1 = _mm512_reduce_gmax_pd(t1);
        __m512d t2 = _mm512_load_pd(&v3[8]);
        t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
        double vmax2 = _mm512_reduce_gmax_pd(t2);

        if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD) {
          t1 = _mm512_mul_pd(t1, twotothe256_MIC);
          _mm512_store_pd(&v3[0], t1);
          t2 = _mm512_mul_pd(t2, twotothe256_MIC);
          _mm512_store_pd(&v3[8], t2);

          if(!fastScaling)
            ex3[i] += 1;
          else
            addScale += wgt[i];
        }
      } // site loop
    }
    break;
    default:
//      assert(0);
      break;
  }

  /* as above, increment the global counter that counts scaling multiplications by
     the scaling multiplications carried out for computing the likelihood array at node p */
  if (fastScaling) {
    *scalerIncrement = addScale;
  }
}
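/*
 * Scalar sketch of the rescaling decision used in the two inner cases above
 * (an editorial illustration, not part of PLL; assumes <math.h> and the PLL
 * constants from the surrounding file): if every one of the 16 per-site
 * entries has magnitude below PLL_MINLIKELIHOOD, the site vector is scaled by
 * 2^256 and the event is recorded, either per site (ex3) or aggregated with
 * the site weight (addScale). Note that the MIC code above applies the
 * multiply to the abs-masked registers it already built for the max test.
 */
static void rescale_site_scalar(double *v3, int *ex3, int *addScale,
                                const int *wgt, int i, int fastScaling) {
  double vmax = 0.0;
  for (int k = 0; k < 16; ++k) {
    double m = fabs(v3[k]);
    if (m > vmax) vmax = m;
  }
  if (vmax < PLL_MINLIKELIHOOD) {
    for (int k = 0; k < 16; ++k)
      v3[k] *= PLL_TWOTOTHE256;
    if (!fastScaling)
      ex3[i] += 1;
    else
      *addScale += wgt[i];
  }
}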
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_update_helmholtz_no_h2(
    const double* i_g1, const double* i_g2, const double* i_g3,
    const double* i_tm1, const double* i_tm2, const double* i_tm3,
    double* io_c, const double i_h1, const int i_length)
{
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));
      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));
      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2);
      vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));
      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1 = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2 = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3 = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE void stream_vector_compscale(
    const double* i_a, const double* i_b, double* io_c, const int i_length)
{
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_a_1, vec_b_1;
      __m256d vec_a_2, vec_b_2;
      vec_a_1 = _mm256_loadu_pd(&(i_a[l_n]));
      vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4]));
      vec_b_1 = _mm256_loadu_pd(&(i_b[l_n]));
      vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4]));
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_a, vec_b;
      vec_a = _mm512_loadu_pd(&(i_a[l_n]));
      vec_b = _mm512_loadu_pd(&(i_b[l_n]));
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
}
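/*
 * Minimal driver (an editorial sketch, not part of libxsmm): allocates
 * 64-byte-aligned vectors, computes c[i] = a[i]*b[i] via
 * stream_vector_compscale, and prints two entries as a spot check.
 * posix_memalign is used only for illustration; any aligned allocator works.
 */
#include <stdio.h>
#include <stdlib.h>

int main(void) {
  const int n = 1 << 20;
  double *a = NULL, *b = NULL, *c = NULL;
  if (posix_memalign((void**)&a, 64, n * sizeof(double)) ||
      posix_memalign((void**)&b, 64, n * sizeof(double)) ||
      posix_memalign((void**)&c, 64, n * sizeof(double))) {
    return 1;
  }
  for (int i = 0; i < n; ++i) {
    a[i] = 1.0 + i;
    b[i] = 2.0;
  }
  stream_vector_compscale(a, b, c, n);
  printf("c[0]=%f c[n-1]=%f\n", c[0], c[n-1]);  /* expect 2.0 and 2.0*n */
  free(a); free(b); free(c);
  return 0;
}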
inline
short_vec<double, 8> operator*(const short_vec<double, 8>& other) const
{
    return short_vec<double, 8>(
        _mm512_mul_pd(val, other.val));
}
inline
void operator*=(const short_vec<double, 8>& other)
{
    val = _mm512_mul_pd(val, other.val);
}
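// Usage sketch (an editorial illustration, assuming a LibFlatArray-style
// short_vec<double, 8> with a scalar broadcast constructor and a store()
// member, neither of which is shown here): each operator above maps onto a
// single _mm512_mul_pd over the two 8-lane double registers.
inline void multiply_example()
{
    short_vec<double, 8> a(2.0);
    short_vec<double, 8> b(3.0);
    short_vec<double, 8> c = a * b;   // operator* above
    a *= b;                           // operator*= above

    double out[8];
    c.store(out);                     // every lane of out should hold 6.0
}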