void mad12(int num, DT* data, int repeat, DT vv1, DT vv2)
{
  int gid, j;
#ifndef MIC
  /* scalar path: one independent multiply-add chain per element */
  #pragma omp parallel for private(j)
  for (gid = 0; gid < num; gid++) {
    register DT s  = (DT)(0.999f);
    register DT v1 = vv1;
    register DT v2 = vv2;
    for (j = 0; j < repeat; ++j) {
      MADD1_MOP2
    }
    data[gid] = s;
  }
#else
  /* MIC path: one 512-bit vector (STEP elements) per iteration */
  #pragma omp parallel for private(j)
  for (gid = 0; gid < num; gid = gid + STEP) {
    __m512d s  = _mm512_set1_pd(0.999f);
    __m512d v1 = _mm512_set1_pd(vv1);
    __m512d v2 = _mm512_set1_pd(vv2);
    for (j = 0; j < repeat; ++j) {
      MADD1_MOP2
    }
    _mm512_store_pd(&(data[gid]), s);
  }
#endif
  return;
}
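/*
 * Sketch only (assumption, not part of the original source): MADD1_MOP2 is
 * defined elsewhere in the benchmark. The kernel above only requires that it
 * update s from v1 and v2 once per expansion, e.g. one fused multiply-add.
 * A hypothetical definition consistent with both branches could look like:
 */
#ifndef MIC
#define MADD1_MOP2 s = v1 * s + v2;
#else
#define MADD1_MOP2 s = _mm512_fmadd_pd(v1, s, v2);
#endif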
double vNormalIntegral(double b)
{
  __declspec(align(64)) __m512d vec_cf0, vec_cf1, vec_cf2, vec_s, vec_stp, vec_exp;

  /* Composite Simpson's rule on [0, b] with NN subintervals.
     NN/2-1 has to be a multiple of 8: NN = (8*LV+1)*2, LV = 20 -> NN = 322. */
  //const int NN = 322;
  const int vecsize = 8;
  const int nCal = (NN/2 - 1) / vecsize;
  //const int left = NN % vecsize;
  double a = 0.0;
  double s, h, sum = 0.0;
  h = (b - a) / NN;

  /* add in the first few terms ... */
  sum += exp(-a*a/2.0) + 4.0*exp(-(a+h)*(a+h)/2.0);
  /* ... and the last one */
  sum += exp(-b*b/2.0);

  vec_cf0 = _mm512_set1_pd(a);
  vec_cf1 = _mm512_set1_pd(2*h);
  vec_cf2 = _mm512_set1_pd(-0.5);
  vec_s   = _mm512_set_pd(8, 7, 6, 5, 4, 3, 2, 1);  /* vectorize */
  vec_s   = _mm512_mul_pd(vec_s, vec_cf1);          /* (16h, 14h, ..., 2h) */
  vec_s   = _mm512_add_pd(vec_cf0, vec_s);          /* (a+16h, ..., a+2h) */
  vec_stp = _mm512_set1_pd(2*h*vecsize - h);        /* stride to the next block of even nodes */
  vec_cf0 = _mm512_set1_pd(h);

  for (int i = 0; i < nCal; ++i) {
    /* even nodes: weight 2 */
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);
    sum += 2.0 * _mm512_reduce_add_pd(vec_cf1);

    /* odd nodes: weight 4 */
    vec_s   = _mm512_add_pd(vec_s, vec_cf0);        /* s += h */
    vec_exp = _mm512_mul_pd(vec_s, vec_s);
    vec_exp = _mm512_mul_pd(vec_exp, vec_cf2);
    vec_cf1 = _mm512_exp_pd(vec_exp);
    sum += 4.0 * _mm512_reduce_add_pd(vec_cf1);

    vec_s = _mm512_add_pd(vec_s, vec_stp);
  }

  sum = 0.5*sqrt(2*PI) + h*sum/3.0;

  return sum;
}
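/*
 * Usage sketch (assumption, not part of the original source): the value
 * returned above is 0.5*sqrt(2*PI) plus the Simpson approximation of the
 * integral of exp(-x*x/2) from 0 to b, i.e. sqrt(2*PI)*Phi(b) for b >= 0,
 * so the standard normal CDF would be recovered by dividing once more:
 */
double normal_cdf(double b)
{
  return vNormalIntegral(b) / sqrt(2.0 * PI);
}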
void SpMM(Csr<ValueType>* m1, Csr<ValueType>* m2, int num_buckets)
{
  vector<FastHash<int, ValueType>*> result_map(m1->num_rows);
  for (auto& v : result_map) {
    v = new FastHash<int, ValueType>(num_buckets);
  }
  cout << "Starting SpMM..." << endl;

  float res = 0;
  double before = rtclock();
  for (int i = 0; i < m1->num_rows; i++) {
    for (int j = m1->rows[i]; j < m1->rows[i+1]; j++) {
      int cola = m1->cols[j];
      __m512d a = _mm512_set1_pd(m1->vals[j]);
      /* rows of m2 are assumed padded to a multiple of 16 entries,
         with col == -1 marking padding */
      for (int k = m2->rows[cola]; k < m2->rows[cola] + m2->row_lens[cola]; k += 16) {
        __m512d *pb1   = (__m512d *)(&(m2->vals[k]));
        __m512d *pb2   = (__m512d *)(&(m2->vals[k]) + 8);
        __m512i *pcols = (__m512i *)(&(m2->cols[k]));
        __m512d c1 = _mm512_mul_pd(a, *pb1);
        __m512d c2 = _mm512_mul_pd(a, *pb2);
        for (int x = 0; x < 8; x++) {
          int col = ((int *)pcols)[x];
          if (col == -1) { continue; }
          ValueType val = ((ValueType *)(&c1))[x];
          result_map[i]->Reduce(col, val);
          res += val;
        }
        for (int x = 0; x < 8; ++x) {
          int col = ((int *)pcols)[x+8];
          if (col == -1) { continue; }
          ValueType val = ((ValueType *)(&c2))[x];
          result_map[i]->Reduce(col, val);
          res += val;
        }
      }
    }
  }
  double after = rtclock();

  cout << "res: " << res << endl;
  cout << RED << "[****Result****] ========> *SIMD Naive* time: "
       << after - before << " secs." << RESET << endl;

  for (auto& v : result_map) {
    delete v;
  }
}
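/*
 * Sketch (assumption, not part of the original source): dereferencing the
 * casted __m512d and __m512i pointers above compiles to aligned vector
 * loads, so &m2->vals[k] and &m2->cols[k] must be 64-byte aligned. The same
 * chunk written with explicit aligned-load intrinsics (assuming ValueType
 * is double) would read:
 */
__m512d b1     = _mm512_load_pd(&m2->vals[k]);        // vals[k .. k+7]
__m512d b2     = _mm512_load_pd(&m2->vals[k + 8]);    // vals[k+8 .. k+15]
__m512i cols16 = _mm512_load_epi32(&m2->cols[k]);     // cols[k .. k+15]
__m512d c1 = _mm512_mul_pd(a, b1);
__m512d c2 = _mm512_mul_pd(a, b2);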
int main(int argc, char** argv)
{
  /* variable declaration */
  DB(DB_LVL, "declaration");
  DT *memA_t0, *memA_t1, *memA_t2, *memA_t3;
  DT *memB_t0, *memB_t1, *memB_t2, *memB_t3;
  DT *memO_t0, *memO_t1, *memO_t2, *memO_t3;
  int reps, size;
  int samples;
  int tid;
  int i, p, r, bytes, elems;
  int bytes_min, bytes_max;
  int elems_min, elems_max;
  double func_overhead;
  double t_start, t_end;
  double t_min, c_min;
  double alpha = 0.5;
  DB(DB_LVL, SEPERATOR);

  /* initialization */
  DB(DB_LVL, "initialization");
  samples = 3;
  bytes_min = 1024, bytes_max = 1024*32;                              /* [1KB, 32KB] */
  elems_min = bytes_min/sizeof(DT), elems_max = bytes_max/sizeof(DT); /* the number of elements */
  reps = 40000;
  DB(DB_LVL, SEPERATOR);

  /* omp environment */
  const int nthreads = argc > 1 ? atoi(argv[1]) : 4;
  fprintf(stderr, "nthreads= %d\n", nthreads);
  omp_set_num_threads(nthreads);

  /* iteration over working-set sizes */
  DB(DB_LVL, "measurement");
  for (elems = elems_min, bytes = bytes_min;
       elems <= elems_max;
       elems = elems + elems_min, bytes = bytes + bytes_min) {

    /* one private x/y/z buffer set per thread, padded to bytes_max */
    memA_t0 = (DT *)_mm_malloc(bytes_max, 64);
    memA_t1 = (DT *)_mm_malloc(bytes_max, 64);
    memA_t2 = (DT *)_mm_malloc(bytes_max, 64);
    memA_t3 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t0 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t1 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t2 = (DT *)_mm_malloc(bytes_max, 64);
    memB_t3 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t0 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t1 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t2 = (DT *)_mm_malloc(bytes_max, 64);
    memO_t3 = (DT *)_mm_malloc(bytes_max, 64);

    /* initialize the local space */
    fill(memA_t0, elems, 1.0);
    fill(memA_t1, elems, 2.0);
    fill(memA_t2, elems, 3.0);
    fill(memA_t3, elems, 4.0);
    fill(memB_t0, elems, 1.0);
    fill(memB_t1, elems, 2.0);
    fill(memB_t2, elems, 3.0);
    fill(memB_t3, elems, 4.0);
    fill(memO_t0, elems, 1.0);
    fill(memO_t1, elems, 2.0);
    fill(memO_t2, elems, 3.0);
    fill(memO_t3, elems, 4.0);

    /* measurement */
    t_min = 0.0;
    c_min = 0.0;
    DT ret_t0 = 0.0;
    DT ret_t1 = 0.0;
    DT ret_t2 = 0.0;
    DT ret_t3 = 0.0;

#ifdef SAXPY2
#define Z _z
#else
#define Z _z   /* both variants currently write to _z */
#endif

    for (p = 0; p < samples; p++) {
      __m512d *_x, *_y, *_z;
      #pragma omp parallel private(_x,_y,_z) default(shared)
      {
        int tid;
        tid = omp_get_thread_num();
        /* bind each thread to its own buffer set */
        switch (tid) {
          case 0: _x = (__m512d*)memA_t0; _y = (__m512d*)memB_t0; _z = (__m512d*)memO_t0; break;
          case 1: _x = (__m512d*)memA_t1; _y = (__m512d*)memB_t1; _z = (__m512d*)memO_t1; break;
          case 2: _x = (__m512d*)memA_t2; _y = (__m512d*)memB_t2; _z = (__m512d*)memO_t2; break;
          case 3: _x = (__m512d*)memA_t3; _y = (__m512d*)memB_t3; _z = (__m512d*)memO_t3; break;
          default: assert(0);
        }
        #pragma omp barrier
        if (p == (samples-1)) t_start = timer();

        int r;
        for (r = 0; r < reps; r++) {
          asm("#t0-beg");
#if 0
          /* scalar reference kernel */
          double *memO_t0 = (double*)Z;
          const double *memA_t0 = (double*)_x;
          const double *memB_t0 = (double*)_y;
          #pragma vector aligned
          for (i = 0; i < elems; i = i + 1) {
            //ret_t0 += mem_t0[i];
            memO_t0[i] = alpha * memA_t0[i] + memB_t0[i];
          }
          memO_t0[0] = memO_t0[0] * 0.1; // to avoid overflow and optimizations
#else
          /* vectorized SAXPY, manually unrolled 8x8 = 64 vectors per trip.
             NOTE: this assumes cnts is a multiple of 64 (i.e. bytes is a
             multiple of 4 KB); smaller sizes run past elems into the
             bytes_max-sized padding of the allocation. */
          const int cnts = elems >> 3;   /* number of 8-double vectors */
          const __m512d _a = _mm512_set1_pd(alpha);
          int ib;
          for (ib = 0; ib < cnts; ib += 8*8) {
            Z[ib+0] = _mm512_add_pd(_y[ib+0], _mm512_mul_pd(_a, _x[ib+0]));
            Z[ib+1] = _mm512_add_pd(_y[ib+1], _mm512_mul_pd(_a, _x[ib+1]));
            Z[ib+2] = _mm512_add_pd(_y[ib+2], _mm512_mul_pd(_a, _x[ib+2]));
            Z[ib+3] = _mm512_add_pd(_y[ib+3], _mm512_mul_pd(_a, _x[ib+3]));
            Z[ib+4] = _mm512_add_pd(_y[ib+4], _mm512_mul_pd(_a, _x[ib+4]));
            Z[ib+5] = _mm512_add_pd(_y[ib+5], _mm512_mul_pd(_a, _x[ib+5]));
            Z[ib+6] = _mm512_add_pd(_y[ib+6], _mm512_mul_pd(_a, _x[ib+6]));
            Z[ib+7] = _mm512_add_pd(_y[ib+7], _mm512_mul_pd(_a, _x[ib+7]));

            Z[ib+8+0] = _mm512_add_pd(_y[ib+8+0], _mm512_mul_pd(_a, _x[ib+8+0]));
            Z[ib+8+1] = _mm512_add_pd(_y[ib+8+1], _mm512_mul_pd(_a, _x[ib+8+1]));
            Z[ib+8+2] = _mm512_add_pd(_y[ib+8+2], _mm512_mul_pd(_a, _x[ib+8+2]));
            Z[ib+8+3] = _mm512_add_pd(_y[ib+8+3], _mm512_mul_pd(_a, _x[ib+8+3]));
            Z[ib+8+4] = _mm512_add_pd(_y[ib+8+4], _mm512_mul_pd(_a, _x[ib+8+4]));
            Z[ib+8+5] = _mm512_add_pd(_y[ib+8+5], _mm512_mul_pd(_a, _x[ib+8+5]));
            Z[ib+8+6] = _mm512_add_pd(_y[ib+8+6], _mm512_mul_pd(_a, _x[ib+8+6]));
            Z[ib+8+7] = _mm512_add_pd(_y[ib+8+7], _mm512_mul_pd(_a, _x[ib+8+7]));

            Z[ib+16+0] = _mm512_add_pd(_y[ib+16+0], _mm512_mul_pd(_a, _x[ib+16+0]));
            Z[ib+16+1] = _mm512_add_pd(_y[ib+16+1], _mm512_mul_pd(_a, _x[ib+16+1]));
            Z[ib+16+2] = _mm512_add_pd(_y[ib+16+2], _mm512_mul_pd(_a, _x[ib+16+2]));
            Z[ib+16+3] = _mm512_add_pd(_y[ib+16+3], _mm512_mul_pd(_a, _x[ib+16+3]));
            Z[ib+16+4] = _mm512_add_pd(_y[ib+16+4], _mm512_mul_pd(_a, _x[ib+16+4]));
            Z[ib+16+5] = _mm512_add_pd(_y[ib+16+5], _mm512_mul_pd(_a, _x[ib+16+5]));
            Z[ib+16+6] = _mm512_add_pd(_y[ib+16+6], _mm512_mul_pd(_a, _x[ib+16+6]));
            Z[ib+16+7] = _mm512_add_pd(_y[ib+16+7], _mm512_mul_pd(_a, _x[ib+16+7]));

            Z[ib+24+0] = _mm512_add_pd(_y[ib+24+0], _mm512_mul_pd(_a, _x[ib+24+0]));
            Z[ib+24+1] = _mm512_add_pd(_y[ib+24+1], _mm512_mul_pd(_a, _x[ib+24+1]));
            Z[ib+24+2] = _mm512_add_pd(_y[ib+24+2], _mm512_mul_pd(_a, _x[ib+24+2]));
            Z[ib+24+3] = _mm512_add_pd(_y[ib+24+3], _mm512_mul_pd(_a, _x[ib+24+3]));
            Z[ib+24+4] = _mm512_add_pd(_y[ib+24+4], _mm512_mul_pd(_a, _x[ib+24+4]));
            Z[ib+24+5] = _mm512_add_pd(_y[ib+24+5], _mm512_mul_pd(_a, _x[ib+24+5]));
            Z[ib+24+6] = _mm512_add_pd(_y[ib+24+6], _mm512_mul_pd(_a, _x[ib+24+6]));
            Z[ib+24+7] = _mm512_add_pd(_y[ib+24+7], _mm512_mul_pd(_a, _x[ib+24+7]));

            Z[ib+32+0] = _mm512_add_pd(_y[ib+32+0], _mm512_mul_pd(_a, _x[ib+32+0]));
            Z[ib+32+1] = _mm512_add_pd(_y[ib+32+1], _mm512_mul_pd(_a, _x[ib+32+1]));
            Z[ib+32+2] = _mm512_add_pd(_y[ib+32+2], _mm512_mul_pd(_a, _x[ib+32+2]));
            Z[ib+32+3] = _mm512_add_pd(_y[ib+32+3], _mm512_mul_pd(_a, _x[ib+32+3]));
            Z[ib+32+4] = _mm512_add_pd(_y[ib+32+4], _mm512_mul_pd(_a, _x[ib+32+4]));
            Z[ib+32+5] = _mm512_add_pd(_y[ib+32+5], _mm512_mul_pd(_a, _x[ib+32+5]));
            Z[ib+32+6] = _mm512_add_pd(_y[ib+32+6], _mm512_mul_pd(_a, _x[ib+32+6]));
            Z[ib+32+7] = _mm512_add_pd(_y[ib+32+7], _mm512_mul_pd(_a, _x[ib+32+7]));

            Z[ib+40+0] = _mm512_add_pd(_y[ib+40+0], _mm512_mul_pd(_a, _x[ib+40+0]));
            Z[ib+40+1] = _mm512_add_pd(_y[ib+40+1], _mm512_mul_pd(_a, _x[ib+40+1]));
            Z[ib+40+2] = _mm512_add_pd(_y[ib+40+2], _mm512_mul_pd(_a, _x[ib+40+2]));
            Z[ib+40+3] = _mm512_add_pd(_y[ib+40+3], _mm512_mul_pd(_a, _x[ib+40+3]));
            Z[ib+40+4] = _mm512_add_pd(_y[ib+40+4], _mm512_mul_pd(_a, _x[ib+40+4]));
            Z[ib+40+5] = _mm512_add_pd(_y[ib+40+5], _mm512_mul_pd(_a, _x[ib+40+5]));
            Z[ib+40+6] = _mm512_add_pd(_y[ib+40+6], _mm512_mul_pd(_a, _x[ib+40+6]));
            Z[ib+40+7] = _mm512_add_pd(_y[ib+40+7], _mm512_mul_pd(_a, _x[ib+40+7]));

            Z[ib+48+0] = _mm512_add_pd(_y[ib+48+0], _mm512_mul_pd(_a, _x[ib+48+0]));
            Z[ib+48+1] = _mm512_add_pd(_y[ib+48+1], _mm512_mul_pd(_a, _x[ib+48+1]));
            Z[ib+48+2] = _mm512_add_pd(_y[ib+48+2], _mm512_mul_pd(_a, _x[ib+48+2]));
            Z[ib+48+3] = _mm512_add_pd(_y[ib+48+3], _mm512_mul_pd(_a, _x[ib+48+3]));
            Z[ib+48+4] = _mm512_add_pd(_y[ib+48+4], _mm512_mul_pd(_a, _x[ib+48+4]));
            Z[ib+48+5] = _mm512_add_pd(_y[ib+48+5], _mm512_mul_pd(_a, _x[ib+48+5]));
            Z[ib+48+6] = _mm512_add_pd(_y[ib+48+6], _mm512_mul_pd(_a, _x[ib+48+6]));
            Z[ib+48+7] = _mm512_add_pd(_y[ib+48+7], _mm512_mul_pd(_a, _x[ib+48+7]));

            Z[ib+56+0] = _mm512_add_pd(_y[ib+56+0], _mm512_mul_pd(_a, _x[ib+56+0]));
            Z[ib+56+1] = _mm512_add_pd(_y[ib+56+1], _mm512_mul_pd(_a, _x[ib+56+1]));
            Z[ib+56+2] = _mm512_add_pd(_y[ib+56+2], _mm512_mul_pd(_a, _x[ib+56+2]));
            Z[ib+56+3] = _mm512_add_pd(_y[ib+56+3], _mm512_mul_pd(_a, _x[ib+56+3]));
            Z[ib+56+4] = _mm512_add_pd(_y[ib+56+4], _mm512_mul_pd(_a, _x[ib+56+4]));
            Z[ib+56+5] = _mm512_add_pd(_y[ib+56+5], _mm512_mul_pd(_a, _x[ib+56+5]));
            Z[ib+56+6] = _mm512_add_pd(_y[ib+56+6], _mm512_mul_pd(_a, _x[ib+56+6]));
            Z[ib+56+7] = _mm512_add_pd(_y[ib+56+7], _mm512_mul_pd(_a, _x[ib+56+7]));
          }
#endif
          asm("#t0-end");
        }
      } /* omp parallel */
      if (p == (samples-1)) t_end = timer();
    }

    t_min = (t_end - t_start)/reps;
    printf("%lf,%lf,%lf,%lf\n", ret_t0, ret_t1, ret_t2, ret_t3);
    SAVE_DATA("%lf\t", 3*nthreads*bytes/t_min);
    printf("cbw: %lf\t elems= %d mem_tot= %zu\n",
           3*nthreads*bytes/t_min, elems/8, 3*elems*sizeof(DT)*nthreads);

    if (memA_t0 != NULL) _mm_free(memA_t0);
    if (memA_t1 != NULL) _mm_free(memA_t1);
    if (memA_t2 != NULL) _mm_free(memA_t2);
    if (memA_t3 != NULL) _mm_free(memA_t3);
    if (memB_t0 != NULL) _mm_free(memB_t0);
    if (memB_t1 != NULL) _mm_free(memB_t1);
    if (memB_t2 != NULL) _mm_free(memB_t2);
    if (memB_t3 != NULL) _mm_free(memB_t3);
    if (memO_t0 != NULL) _mm_free(memO_t0);
    if (memO_t1 != NULL) _mm_free(memO_t1);
    if (memO_t2 != NULL) _mm_free(memO_t2);
    if (memO_t3 != NULL) _mm_free(memO_t3);
  }
  DB(DB_LVL, SEPERATOR);

  /* post-process */
  DB(DB_LVL, "post-process");
  DB(DB_LVL, SEPERATOR);
}
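/*
 * Assumed helpers (not shown in this listing): the benchmark above only needs
 * fill() to initialize a buffer and timer() to return wall-clock seconds as a
 * double. Minimal sketches consistent with that usage:
 */
static void fill(DT *p, int n, DT v)
{
  for (int i = 0; i < n; ++i) p[i] = v;
}

static double timer(void)
{
  struct timeval tv;
  gettimeofday(&tv, NULL);           /* requires <sys/time.h> */
  return tv.tv_sec + 1.0e-6 * tv.tv_usec;
}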
inline short_vec(const double data = 0) : val{_mm512_set1_pd(data), _mm512_set1_pd(data)} {}
  , double complex const E[restrict PNLx][PNLy][PNLz]
  , double complex       F[restrict PNLx][PNLy][PNLz]
)
{
  const double A = *A_;
  double const         *b;
  double complex const *e;
  double complex       *f;
  int ix, iy, iz, n;
#ifdef ARTED_STENCIL_LOOP_BLOCKING
  int bx, by;
#endif

  __m512d at   = _mm512_set1_pd(A);
  __m512d HALF = _mm512_set1_pd(-0.5);

#ifdef TUNING_COMPLEX_MUL
  __m512i INV = _mm512_set4_epi64(1LL << 63, 0, 1LL << 63, 0);
#else
  __m512d ZI = _mm512_set_pd(-1, 0, -1, 0, -1, 0, -1, 0);
#endif

  __declspec(align(64)) double G[12];
  for (n = 0; n < 12; ++n)
    G[n] = C[n] * -0.5;

  __m512i nly = _mm512_set1_epi32(PNLy);
  __m512i nlz = _mm512_set1_epi32(PNLz);

#ifdef ARTED_DOMAIN_POWER_OF_TWO
  __m512i myx = _mm512_mask_blend_epi32(0xFF00,
                                        _mm512_set1_epi32(NLy - 1),
                                        _mm512_set1_epi32(NLx - 1));
void newviewGTRGAMMA_MIC(int tipCase,
                         double *x1, double *x2, double *x3,
                         double *extEV, double *tipVector,
                         int *ex3, unsigned char *tipX1, unsigned char *tipX2,
                         int n, double *left, double *right, int *wgt,
                         int *scalerIncrement, const pllBoolean fastScaling)
{
  __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
  __m512d twotothe256_MIC   = _mm512_set1_pd(PLL_TWOTOTHE256);
  __m512i absMask_MIC       = _mm512_set1_epi64(0x7fffffffffffffffULL);

  int addScale = 0;

  /* broadcast the 4x4 eigenvector matrix into a 4x16 layout */
  double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
  #pragma ivdep
  for (int l = 0; l < 64; ++l) {
    aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
  }

  switch (tipCase) {
    case PLL_TIP_TIP:
    {
      /* multiply all possible tip state vectors with the respective P-matrices */
      double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

      for (int k = 0; k < 256; ++k) {
        umpX1[k] = 0.0;
        umpX2[k] = 0.0;
      }

      for (int i = 0; i < maxStateValue; ++i) {
        for (int l = 0; l < states; ++l) {
          #pragma ivdep
          for (int k = 0; k < span; ++k) {
            umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
            umpX2[16 * i + k] += tipVector[i * 4 + l] * right[k * 4 + l];
          }
        }
      }

      double auX[64] __attribute__((align(64)));

      for (int i = 0; i < n; ++i) {
        _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
        _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

        const double *uX1 = &umpX1[16 * tipX1[i]];
        const double *uX2 = &umpX2[16 * tipX2[i]];

        double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double *v = &x3[i * 16];

        #pragma ivdep
        #pragma vector aligned
        for (int l = 0; l < 16; ++l) {
          uX[l] = uX1[l] * uX2[l];
          v[l]  = 0.;
        }

        mic_broadcast16x64(uX, auX);

        for (int j = 0; j < 4; ++j) {
          #pragma ivdep
          #pragma vector aligned
          #pragma vector nontemporal
          for (int k = 0; k < 16; ++k) {
            v[k] += auX[j*16 + k] * aEV[j*16 + k];
          }
        }

        // init scaling counter for the site
        if (!fastScaling)
          ex3[i] = 0;
      } // sites loop
    }
    break;
    case PLL_TIP_INNER:
    {
      /* we do analogous pre-computations as above, with the only difference
         that we now do them only for one tip vector */
      double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

      /* precompute P and left tip vector product */
      for (int k = 0; k < 256; ++k) {
        umpX1[k] = 0.0;
      }

      for (int i = 0; i < 16; ++i) {
        for (int l = 0; l < 4; ++l) {
          #pragma ivdep
          for (int k = 0; k < 16; ++k) {
            umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
          }
        }
      }

      // re-arrange right matrix for better memory layout
      double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      for (int j = 0; j < 4; j++) {
        for (int l = 0; l < 16; l++) {
          aRight[j*16 + l] = right[l*4 + j];
        }
      }

      for (int i = 0; i < n; i++) {
        _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
        _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
        _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
        _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
        _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
        _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

        /* access pre-computed value based on the raw sequence data tipX1
           that is used as an index */
        double *uX1 = &umpX1[span * tipX1[i]];

        double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double uX[16]  __attribute__((align(PLL_BYTE_ALIGNMENT)));

        #pragma vector aligned
        for (int l = 0; l < 16; ++l) {
          uX2[l] = 0.;
        }

        double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        const double *v2 = &(x2[16 * i]);
        mic_broadcast16x64(v2, aV2);

        for (int j = 0; j < 4; j++) {
          #pragma ivdep
          #pragma vector aligned
          for (int l = 0; l < 16; l++) {
            uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
          }
        }

        double *v3 = &(x3[span * i]);

        #pragma ivdep
        #pragma vector aligned
        for (int l = 0; l < 16; ++l) {
          uX[l] = uX1[l] * uX2[l];
          v3[l] = 0.;
        }

        double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        mic_broadcast16x64(uX, auX);

        for (int j = 0; j < 4; ++j) {
          #pragma ivdep
          #pragma vector aligned
          for (int k = 0; k < 16; ++k) {
            v3[k] += auX[j*16 + k] * aEV[j*16 + k];
          }
        }

        /* rescale the site if all 16 entries fell below the threshold */
        __m512d t1 = _mm512_load_pd(&v3[0]);
        t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
        double vmax1 = _mm512_reduce_gmax_pd(t1);
        __m512d t2 = _mm512_load_pd(&v3[8]);
        t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
        double vmax2 = _mm512_reduce_gmax_pd(t2);

        if (vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD) {
          t1 = _mm512_mul_pd(t1, twotothe256_MIC);
          _mm512_store_pd(&v3[0], t1);
          t2 = _mm512_mul_pd(t2, twotothe256_MIC);
          _mm512_store_pd(&v3[8], t2);

          if (!fastScaling)
            ex3[i] += 1;
          else
            addScale += wgt[i];
        }
      } // site loop
    }
    break;
    case PLL_INNER_INNER:
    {
      /* same as above, without pre-computations */

      // re-arrange left and right matrices for better memory layout
      double aLeft[64]  __attribute__((align(PLL_BYTE_ALIGNMENT)));
      double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
      for (int j = 0; j < 4; j++) {
        for (int l = 0; l < 16; l++) {
          aLeft[j*16 + l]  = left[l*4 + j];
          aRight[j*16 + l] = right[l*4 + j];
        }
      }

      for (int i = 0; i < n; i++) {
        _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
        _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
        _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
        _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
        _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
        _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
        _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
        _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
        _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
        _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
        _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

        double uX1[16] __attribute__((align(64)));
        double uX2[16] __attribute__((align(64)));
        double uX[16]  __attribute__((align(64)));

        for (int l = 0; l < 16; l++) {
          uX1[l] = 0.;
          uX2[l] = 0.;
        }

        double aV1[64] __attribute__((align(64)));
        double aV2[64] __attribute__((align(64)));

        const double *v1 = &(x1[span * i]);
        const double *v2 = &(x2[span * i]);

        mic_broadcast16x64(v1, aV1);
        mic_broadcast16x64(v2, aV2);

        for (int j = 0; j < 4; j++) {
          #pragma ivdep
          #pragma vector aligned
          for (int l = 0; l < 16; l++) {
            uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
            uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
          }
        }

        double *v3 = &(x3[span * i]);

        #pragma ivdep
        #pragma vector aligned
        for (int l = 0; l < 16; ++l) {
          uX[l] = uX1[l] * uX2[l];
          v3[l] = 0.;
        }

        double auX[64] __attribute__((align(64)));
        mic_broadcast16x64(uX, auX);

        for (int j = 0; j < 4; ++j) {
          #pragma ivdep
          #pragma vector aligned
          for (int k = 0; k < 16; ++k) {
            v3[k] += auX[j*16 + k] * aEV[j*16 + k];
          }
        }

        __m512d t1 = _mm512_load_pd(&v3[0]);
        t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
        double vmax1 = _mm512_reduce_gmax_pd(t1);
        __m512d t2 = _mm512_load_pd(&v3[8]);
        t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
        double vmax2 = _mm512_reduce_gmax_pd(t2);

        if (vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD) {
          t1 = _mm512_mul_pd(t1, twotothe256_MIC);
          _mm512_store_pd(&v3[0], t1);
          t2 = _mm512_mul_pd(t2, twotothe256_MIC);
          _mm512_store_pd(&v3[8], t2);

          if (!fastScaling)
            ex3[i] += 1;
          else
            addScale += wgt[i];
        }
      }
    }
    break;
    default:
      // assert(0);
      break;
  }

  /* as above, increment the global counter that counts scaling multiplications
     by the scaling multiplications carried out for computing the likelihood
     array at node p */
  if (fastScaling) {
    *scalerIncrement = addScale;
  }
}
inline short_vec(const double data = 0) : val(_mm512_set1_pd(data)) {}
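/*
 * Minimal sketch (assumption, not part of the original source): the two
 * constructors above broadcast a scalar into the wrapper's __m512d storage
 * (two registers for 16 doubles, one for 8). A self-contained wrapper in the
 * same style, with illustrative names, shows why the default argument makes
 * both `vec8 v;` and `vec8 v(1.5);` well-formed:
 */
#include <immintrin.h>

class vec8 {
public:
    inline vec8(const double data = 0) : val(_mm512_set1_pd(data)) {}
    inline __m512d value() const { return val; }
private:
    __m512d val;
};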