/*
 * printcounters() -- take one sample of the machine-wide performance
 * counters and hand it to sample().
 *
 * ctrs      per-CPU counter blocks, updated concurrently by other threads
 * duration  length of the sampling interval, in the same units as the
 *           clocks1 event (see the 2*delta[clocks1] > duration test)
 *
 * Reads/updates file-scope state: lastctr[] (previous per-cpu snapshot),
 * sevents[] (running per-event totals), prevnreads/prevnwrites (previous
 * memory-controller totals).  Not reentrant.
 */
void printcounters(struct counter *ctrs, uint64_t duration)
{
    struct metrics s = {0};
    s.timestamp = _rdtsc();     /* timestamp the sample with the TSC */
    s.duration = duration;
    // We skip the last core
    /* NOTE(review): the loop covers cpus 1 .. ncpus-4.  On MIC the OS
       presumably maps cpu 0 plus the last three cpu numbers onto the
       last physical core, so this skips exactly that core -- confirm
       the cpu numbering on the target system. */
    int corethreads =0;         /* active hw threads seen on the current core */
    for (int cpu = 1; cpu < gbl.ncpus-3; ++cpu)
    {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];
        for (int i = 0; i < NEVENTS; ++i)
        {
            /* read the 8 64-bit counter words of event i as one 512-bit vector */
            union { __m512d c; uint64_t values[8]; } t;
            t.c = _mm512_load_pd((void *)&p->counts[i][0]);
            /* delta since the previous snapshot, scaled by perf_scale_delta() */
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            /* save this snapshot; nontemporal store keeps lastctr out of cache */
            _mm512_storenrngo_pd((void *)&lastctr[cpu].counts[i][0], t.c);
            if (delta[i] < 0)   /* clamp negative deltas (wrap/race) to zero */
                delta[i] = 0;
            sevents[i] += delta[i];
        }
        /* count the thread as active if its clock event (presumably
           unhalted cycles -- confirm) covered over half the interval */
        if (2*delta[clocks1] > duration)
        {
            s.nthreads += 1;
            corethreads += 1;
        }
        if ((cpu % 4) == 0) // Last thread on this core
        {
            /* 4 hw threads per core; the core counts as active if any
               of its threads was active */
            if (corethreads)
                s.ncores += 1;
            corethreads = 0;
        }
        s.vpu_ea += delta[vpu_ea];
        s.instrs += delta[instrs];
        s.vinstrs += delta[vpu_ie];
    }
    /* memory traffic: sum read (slot 0) and write (slot 1) counters over
       all GBOXes (memory controllers), 2 channels each */
    uint64_t nreads = 0, nwrites = 0;
    for (int i = 0; i < NGBOXES; ++i)
        for (int j = 0; j < 2; ++j)
        {
            nreads += pmu_rdctr(i, j, 0);
            nwrites += pmu_rdctr(i, j, 1);
        }
    /* convert transaction counts to bytes at 64 bytes (one cache line) each */
    s.rbytes = (nreads - prevnreads) * 64;
    s.wbytes = (nwrites - prevnwrites)* 64;
    prevnreads = nreads;
    prevnwrites = nwrites;
    sample(&s);
}
int main() { __m512d t0,t1; double d1[8] __attribute__ ((aligned(64))); double d2[8] __attribute__ ((aligned(64))); double d3[8] __attribute__ ((aligned(64))); for(int i=0; i<8; i++) { d1[i]= i*1.0; d2[i]= 0.0; d3[i] = d1[i]; } //printf("testing intialization of registers\n"); //_mm512_store_pd(d1,t0); //printf("d1=t0: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); //_mm512_store_pd(d1,t1); //printf("d1=t1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); t0 = _mm512_load_pd(d1); printf("permute backward\n"); t1 = (__m512d) _mm512_permute4f128_epi32 ( (__m512i) t0, 0b00111001); _mm512_store_pd(d2,t1); printf("d1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); printf("d2: %f %f %f %f %f %f %f %f \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]); printf("permute forward\n"); t1 = (__m512d) _mm512_permute4f128_epi32 ( (__m512i) t0, 0b10010011); _mm512_store_pd(d2,t1); printf("d1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); printf("d2: %f %f %f %f %f %f %f %f \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]); int __attribute__((aligned(64))) order[16]={0,1,0,1,4,5,6,7,8,9,10,11,12,13,14,15}; __m512i morder = _mm512_load_epi32(order); printf("permuting doubles\n"); t1 = (__m512d) _mm512_permutevar_epi32 (morder, (__m512i) t0); _mm512_store_pd(d2,t1); printf("d1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]); printf("d2: %f %f %f %f %f %f %f %f \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]); return 0; }
/*
 * Copy `size` bytes from sbuf to rbuf in 64-byte blocks (one cache line,
 * i.e. eight doubles / one __m512d per block), splitting the blocks across
 * OpenMP threads.  The destination is written with no-read-hint nontemporal
 * stores (storenrngo) to avoid polluting the cache with the receive buffer.
 *
 * Requirements: rbuf and sbuf must be 64-byte aligned and `size` a multiple
 * of 64 (_mm512_load_pd faults on unaligned addresses; a trailing partial
 * block is silently dropped by the division).
 *
 * recv_req/send_req are part of the call signature but unused here.
 *
 * Fixes vs. previous version:
 *  - the constant was named N_DOUBLES_PER_BLOCK and computed as
 *    64/sizeof(char) == 64, but it is a *byte* count (8 doubles);
 *  - rbuf/sbuf are uintptr_t, so passing them straight to the intrinsics
 *    was an implicit integer->pointer conversion: casts added;
 *  - the loop index no longer mixes signed int with size_t.
 */
inline void transfer_omp_loop_nontemp(uintptr_t rbuf, uintptr_t sbuf, size_t size, HMPI_Request recv_req, HMPI_Request send_req)
{
    const size_t BYTES_PER_BLOCK = 64;                     /* one cache line = 8 doubles */
    const ptrdiff_t total = (ptrdiff_t)(size / BYTES_PER_BLOCK);

#pragma omp parallel for
    for (ptrdiff_t i = 0; i < total; i++) {
        /* address arithmetic is done in integer space (uintptr_t) and
           converted back to pointers for the intrinsics */
        __m512d v_b = _mm512_load_pd((const void *)(sbuf + BYTES_PER_BLOCK * (size_t)i));
        _mm512_storenrngo_pd((void *)(rbuf + BYTES_PER_BLOCK * (size_t)i), v_b);
    }
}
/* Exercises the AVX-512F double-precision mov/load/store intrinsics in
   their plain, merge-masked and zero-masked forms.  x1, x2 (__m512d),
   m (__mmask8) and p (double*) are file-scope globals declared elsewhere;
   presumably a compile-only intrinsics test (compiler-testsuite style)
   -- confirm against the surrounding file. */
void extern avx512f_test (void)
{
  /* register-to-register moves: merge-masked, then zero-masked */
  x1 = _mm512_mask_mov_pd (x1, m, x2);
  x1 = _mm512_maskz_mov_pd (m, x2);
  /* aligned loads from p: plain, merge-masked, zero-masked */
  x1 = _mm512_load_pd (p);
  x1 = _mm512_mask_load_pd (x1, m, p);
  x1 = _mm512_maskz_load_pd (m, p);
  /* aligned stores to p: plain, then masked */
  _mm512_store_pd (p, x1);
  _mm512_mask_store_pd (p, m, x1);
}
/*
 * Rank-k update micro-kernel for a 16x14 block (Xeon Phi / KNC intrinsics):
 * accumulates  C += sum_{p=0..k-1} a_p (16x1) * b_p (1x14),
 * where a and b are packed panels with a stride of 16 doubles per
 * iteration (b uses entries 0..13; 14..15 are padding).
 *
 * As in the original "simulate kernel summation" code, the 14 accumulator
 * columns are then folded into column 0 and ONLY that 16-double column is
 * stored to c; ldc and aux are currently unused.
 *
 * Fix vs. previous version: the 28 accumulator vectors were read by
 * _mm512_fmadd_pd while uninitialized (the old "TODO: need to clean the
 * c buffer" -- undefined behavior, garbage results).  They are now zeroed
 * before the k-loop.  The unused local `neg2` was removed.  FMA order and
 * the left-to-right fold order are unchanged, so rounding matches the
 * original sequence given zeroed accumulators.
 */
void ks_rank_k_int_d16x14( int k, double *a, double *b, double *c, int ldc, aux_t *aux )
{
    v8df_t c007[ 14 ];          /* rows 0..7  of columns 0..13 */
    v8df_t c815[ 14 ];          /* rows 8..15 of columns 0..13 */
    v8df_t a007, a815, b_tmp;
    int i, j;

    /* zero the accumulators (this was the missing "clean the c buffer") */
    for ( j = 0; j < 14; ++j ) {
        c007[ j ].v = _mm512_setzero_pd();
        c815[ j ].v = _mm512_setzero_pd();
    }

    for ( i = 0; i < k; ++i ) {
        /* load one 16-element column of a */
        a007.v = _mm512_load_pd( a );
        a815.v = _mm512_load_pd( a + 8 );

        /* broadcast each of the 14 b entries and accumulate one column */
        for ( j = 0; j < 14; ++j ) {
            b_tmp.v = _mm512_extload_pd( b + j, _MM_UPCONV_PD_NONE,
                                         _MM_BROADCAST_1X8, 0 );
            c007[ j ].v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007[ j ].v );
            c815[ j ].v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815[ j ].v );
        }

        a += 16;
        b += 16;    /* packed stride: 14 used entries + 2 padding */
    }

    /* simulate kernel summation: fold columns 1..13 into column 0,
       in the same left-to-right order as the original unrolled code */
    for ( j = 1; j < 14; ++j ) {
        c007[ 0 ].v = _mm512_add_pd( c007[ 0 ].v, c007[ j ].v );
        c815[ 0 ].v = _mm512_add_pd( c815[ 0 ].v, c815[ j ].v );
    }

    /* store only the folded column (rows 0..7, then 8..15) */
    _mm512_store_pd( c    , c007[ 0 ].v );
    _mm512_store_pd( c + 8, c815[ 0 ].v );
}
/*
 * newviewGTRGAMMA_MIC() -- compute the conditional likelihood array at an
 * inner node under the GTRGAMMA model, vectorized for Intel MIC (KNC).
 * Each site occupies 16 doubles (presumably 4 states x 4 gamma rate
 * categories -- confirm; the hard-coded 16s below assume it).
 *
 * tipCase selects which children are tips:
 *   PLL_TIP_TIP     both children are tip sequences (tipX1, tipX2)
 *   PLL_TIP_INNER   left child is a tip (tipX1), right is inner (x2)
 *   PLL_INNER_INNER both children are inner vectors (x1, x2)
 * x3 receives the result for all n sites.  left/right are the 16x4
 * transition-probability matrices, extEV the 4x4 eigenvector matrix,
 * tipVector the per-state tip likelihood vectors.  ex3/wgt/addScale
 * implement numerical rescaling bookkeeping: per-site counters when
 * !fastScaling, a single weighted total (*scalerIncrement) otherwise.
 *
 * NOTE(review): this code references span, states, maxStateValue,
 * PLL_BYTE_ALIGNMENT and mic_broadcast16x64() defined elsewhere; the
 * indexing assumes span == 16, states == 4 -- confirm in the headers.
 */
void newviewGTRGAMMA_MIC(int tipCase, double *x1, double *x2, double *x3, double *extEV,
                         double *tipVector, int *ex3, unsigned char *tipX1, unsigned char *tipX2,
                         int n, double *left, double *right, int *wgt, int *scalerIncrement,
                         const pllBoolean fastScaling)
{
    /* NOTE(review): minlikelihood_MIC is never used below (the scalar
       comparisons use PLL_MINLIKELIHOOD directly). */
    __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
    __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
    /* mask clearing the sign bit of each double: fabs() via bitwise AND */
    __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);

    int addScale = 0;   /* weighted scaling count, used when fastScaling */

    /* NOTE(review): GCC's attribute is spelled "aligned", not "align";
       "align" is an ICC spelling.  If built with GCC these arrays may not
       be 64-byte aligned and the aligned vector accesses would fault --
       verify the build compiler.  (Applies to every such array below.) */
    double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));

    /* replicate the 4x4 eigenvector matrix into 4 rows of 16 so it lines
       up with the 16-double site layout */
#pragma ivdep
    for (int l = 0; l < 64; ++l)
    {
        aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
    }

    switch(tipCase)
    {
      case PLL_TIP_TIP:
      {
        /* multiply all possible tip state vectors with the respective P-matrices */
        double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int k = 0; k < 256; ++k)
        {
            umpX1[k] = 0.0;
            umpX2[k] = 0.0;
        }
        /* precompute P * tipVector for every possible raw tip state */
        for(int i = 0; i < maxStateValue; ++i)
        {
            for(int l = 0; l < states; ++l)
            {
#pragma ivdep
                for(int k = 0; k < span; ++k)
                {
                    umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
                    umpX2[16 * i + k] += tipVector[i * 4 + l] * right[k * 4 + l];
                }
            }
        }

        double auX[64] __attribute__((align(64)));

        for(int i = 0; i < n; ++i)
        {
            /* prefetch x3 for 8 sites ahead (L2) and the next site (L1);
               ET hints mark the lines as written.  (The doubled cast below
               is redundant but harmless.) */
            _mm_prefetch((const char*) (const char*) &x3[span*(i+8)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            /* look up the precomputed products by raw tip state */
            const double *uX1 = &umpX1[16 * tipX1[i]];
            const double *uX2 = &umpX2[16 * tipX2[i]];

            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double* v = &x3[i * 16];

#pragma ivdep
#pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v[l] = 0.;
            }

            /* expand uX into 4 broadcast rows and multiply by the
               eigenvector rows, accumulating into the site vector */
            mic_broadcast16x64(uX, auX);

            for (int j = 0; j < 4; ++j)
            {
#pragma ivdep
#pragma vector aligned
#pragma vector nontemporal
                for(int k = 0; k < 16; ++k)
                {
                    v[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            // init scaling counter for the site
            if (!fastScaling)
                ex3[i] = 0;
        } // sites loop
      }
      break;
      case PLL_TIP_INNER:
      {
        /* we do analogous pre-computations as above, with the only difference
           that we now do them only for one tip vector */
        double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

        /* precompute P and left tip vector product */
        for(int k = 0; k < 256; ++k)
        {
            umpX1[k] = 0.0;
        }
        for(int i = 0; i < 16; ++i)
        {
            for(int l = 0; l < 4; ++l)
            {
#pragma ivdep
                for(int k = 0; k < 16; ++k)
                {
                    umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
                }
            }
        }

        // re-arrange right matrix for better memory layout
        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int j = 0; j < 4; j++)
        {
            for(int l = 0; l < 16; l++)
            {
                aRight[j*16 + l] = right[l*4 + j];
            }
        }

        for (int i = 0; i < n; i++)
        {
            /* prefetch x2 (read) and x3 (written) for 16 sites ahead into
               L2 and the next site into L1 */
            _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            /* access pre-computed value based on the raw sequence data tipX1
               that is used as an index */
            double* uX1 = &umpX1[span * tipX1[i]];

            double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));

#pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX2[l] = 0.;
            }

            /* uX2 = aRight . v2 (right P-matrix times inner child vector) */
            double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            const double* v2 = &(x2[16 * i]);
            mic_broadcast16x64(v2, aV2);

            for(int j = 0; j < 4; j++)
            {
#pragma ivdep
#pragma vector aligned
                for(int l = 0; l < 16; l++)
                {
                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
                }
            }

            double* v3 = &(x3[span * i]);

#pragma ivdep
#pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v3[l] = 0.;
            }

            double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            mic_broadcast16x64(uX, auX);

            for (int j = 0; j < 4; ++j)
            {
#pragma ivdep
#pragma vector aligned
                for(int k = 0; k < 16; ++k)
                {
                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            /* numerical rescaling: if |v3[l]| < PLL_MINLIKELIHOOD for the
               whole site, multiply the site by 2^256 and record it */
            __m512d t1 = _mm512_load_pd(&v3[0]);
            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
            double vmax1 = _mm512_reduce_gmax_pd(t1);
            __m512d t2 = _mm512_load_pd(&v3[8]);
            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
            double vmax2 = _mm512_reduce_gmax_pd(t2);

            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
            {
                /* NOTE(review): the scaled values stored back are the
                   absolute values (t1/t2 already went through the abs
                   mask); this assumes the entries are non-negative here
                   -- confirm against the scalar reference code. */
                t1 = _mm512_mul_pd(t1, twotothe256_MIC);
                _mm512_store_pd(&v3[0], t1);
                t2 = _mm512_mul_pd(t2, twotothe256_MIC);
                _mm512_store_pd(&v3[8], t2);

                if(!fastScaling)
                    ex3[i] += 1;
                else
                    addScale += wgt[i];
            }
        } // site loop
      }
      break;
      case PLL_INNER_INNER:
      {
        /* same as above, without pre-computations */

        // re-arrange right matrix for better memory layout
        double aLeft[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int j = 0; j < 4; j++)
        {
            for(int l = 0; l < 16; l++)
            {
                aLeft[j*16 + l] = left[l*4 + j];
                aRight[j*16 + l] = right[l*4 + j];
            }
        }

        for (int i = 0; i < n; i++)
        {
            /* prefetch all three site vectors: 8 sites ahead into L2,
               next site into L1; x3 with ET (to-be-written) hints */
            _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            double uX1[16] __attribute__((align(64)));
            double uX2[16] __attribute__((align(64)));
            double uX[16] __attribute__((align(64)));

            for(int l = 0; l < 16; l++)
            {
                uX1[l] = 0.;
                uX2[l] = 0.;
            }

            /* uX1 = aLeft . v1, uX2 = aRight . v2 */
            double aV1[64] __attribute__((align(64)));
            double aV2[64] __attribute__((align(64)));
            const double* v1 = &(x1[span * i]);
            const double* v2 = &(x2[span * i]);
            mic_broadcast16x64(v1, aV1);
            mic_broadcast16x64(v2, aV2);

            for(int j = 0; j < 4; j++)
            {
#pragma ivdep
#pragma vector aligned
                for(int l = 0; l < 16; l++)
                {
                    uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
                }
            }

            double* v3 = &(x3[span * i]);

#pragma ivdep
#pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v3[l] = 0.;
            }

            double auX[64] __attribute__((align(64)));
            mic_broadcast16x64(uX, auX);

            for(int j = 0; j < 4; ++j)
            {
#pragma ivdep
#pragma vector aligned
                for(int k = 0; k < 16; ++k)
                {
                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            /* numerical rescaling, identical to the PLL_TIP_INNER case */
            __m512d t1 = _mm512_load_pd(&v3[0]);
            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
            double vmax1 = _mm512_reduce_gmax_pd(t1);
            __m512d t2 = _mm512_load_pd(&v3[8]);
            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
            double vmax2 = _mm512_reduce_gmax_pd(t2);

            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
            {
                t1 = _mm512_mul_pd(t1, twotothe256_MIC);
                _mm512_store_pd(&v3[0], t1);
                t2 = _mm512_mul_pd(t2, twotothe256_MIC);
                _mm512_store_pd(&v3[8], t2);

                if(!fastScaling)
                    ex3[i] += 1;
                else
                    addScale += wgt[i];
            }
        }
      }
      break;
      default:
//        assert(0);
        break;
    }

    /* as above, increment the global counter that counts scaling
       multiplications by the scaling multiplications carried out for
       computing the likelihood array at node p */
    if (fastScaling)
    {
        *scalerIncrement = addScale;
    }
}
/// Aligned load: wrap the eight doubles starting at r (which must be
/// 64-byte aligned) in an F64vec8.
inline F64vec8 load(const double *r)
{
    const __m512d packed = _mm512_load_pd(r);
    return packed;
}
/* Load eight doubles from `data` into this vector's storage (`val`).
   `data` must be 64-byte aligned; SHORTVEC_ASSERT_ALIGNED enforces this
   (presumably a debug-only assertion macro -- confirm its definition). */
inline void load_aligned(const double *data)
{
    SHORTVEC_ASSERT_ALIGNED(data, 64);
    val = _mm512_load_pd(data);
}