#include <stdio.h>
#include <immintrin.h>

int main() {
  __m512d t0, t1;
  double d1[8] __attribute__ ((aligned(64)));
  double d2[8] __attribute__ ((aligned(64)));
  double d3[8] __attribute__ ((aligned(64)));

  for (int i = 0; i < 8; i++) {
    d1[i] = i * 1.0;
    d2[i] = 0.0;
    d3[i] = d1[i];
  }

  //printf("testing initialization of registers\n");
  //_mm512_store_pd(d1,t0);
  //printf("d1=t0: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
  //_mm512_store_pd(d1,t1);
  //printf("d1=t1: %f %f %f %f %f %f %f %f \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);

  t0 = _mm512_load_pd(d1);

  printf("permute backward\n");
  t1 = (__m512d) _mm512_permute4f128_epi32( (__m512i) t0, 0b00111001);
  _mm512_store_pd(d2, t1);
  printf("d1: %f %f %f %f %f %f %f %f \n", d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7]);
  printf("d2: %f %f %f %f %f %f %f %f \n", d2[0], d2[1], d2[2], d2[3], d2[4], d2[5], d2[6], d2[7]);

  printf("permute forward\n");
  t1 = (__m512d) _mm512_permute4f128_epi32( (__m512i) t0, 0b10010011);
  _mm512_store_pd(d2, t1);
  printf("d1: %f %f %f %f %f %f %f %f \n", d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7]);
  printf("d2: %f %f %f %f %f %f %f %f \n", d2[0], d2[1], d2[2], d2[3], d2[4], d2[5], d2[6], d2[7]);

  int __attribute__((aligned(64))) order[16] = {0,1,0,1,4,5,6,7,8,9,10,11,12,13,14,15};
  __m512i morder = _mm512_load_epi32(order);

  printf("permuting doubles\n");
  t1 = (__m512d) _mm512_permutevar_epi32(morder, (__m512i) t0);
  _mm512_store_pd(d2, t1);
  printf("d1: %f %f %f %f %f %f %f %f \n", d1[0], d1[1], d1[2], d1[3], d1[4], d1[5], d1[6], d1[7]);
  printf("d2: %f %f %f %f %f %f %f %f \n", d2[0], d2[1], d2[2], d2[3], d2[4], d2[5], d2[6], d2[7]);

  return 0;
}
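/*
 * Aside (not part of the original program): _mm512_permute4f128_epi32 and
 * _mm512_permutevar_epi32 above are KNC/IMCI intrinsics and do not exist under
 * AVX-512F.  The sketch below is a hedged translation of the same three
 * shuffles to AVX-512F; permute_demo_avx512f and print8 are hypothetical
 * helper names, and the snippet assumes <stdio.h>/<immintrin.h> are already
 * included, as they are above.
 */
static void print8(const char *tag, const double *d)
{
  printf("%s: %f %f %f %f %f %f %f %f\n", tag,
         d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
}

static void permute_demo_avx512f(void)
{
  double d1[8] __attribute__((aligned(64)));
  double d2[8] __attribute__((aligned(64)));
  for (int i = 0; i < 8; i++) { d1[i] = i * 1.0; d2[i] = 0.0; }

  const __m512d t0 = _mm512_load_pd(d1);

  /* rotate backward by one 128-bit lane (KNC pattern 0b00111001) */
  __m512d t1 = _mm512_shuffle_f64x2(t0, t0, 0x39);
  _mm512_store_pd(d2, t1);
  print8("backward", d2);

  /* rotate forward by one 128-bit lane (KNC pattern 0b10010011) */
  t1 = _mm512_shuffle_f64x2(t0, t0, 0x93);
  _mm512_store_pd(d2, t1);
  print8("forward", d2);

  /* element permute: AVX-512F indexes whole doubles, so the 32-bit index pairs
     {0,1},{0,1},{4,5},... of the KNC version collapse to 0,0,2,3,4,5,6,7 */
  const __m512i order = _mm512_set_epi64(7, 6, 5, 4, 3, 2, 0, 0);
  t1 = _mm512_permutexvar_pd(order, t0);
  _mm512_store_pd(d2, t1);
  print8("permuted", d2);
}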
void mad12(int num, DT* data, int repeat, DT vv1, DT vv2)
{
  int gid, j;
#ifndef MIC
  /* j must be private; a shared inner-loop counter would race across threads */
  #pragma omp parallel for private(j)
  for (gid = 0; gid < num; gid++) {
    register DT s  = (DT)(0.999f);
    register DT v1 = vv1;
    register DT v2 = vv2;
    for (j = 0; j < repeat; ++j) {
      MADD1_MOP2
    }
    data[gid] = s;
  }
#else
  #pragma omp parallel for private(j)
  for (gid = 0; gid < num; gid = gid + STEP) {
    __m512d s  = _mm512_set1_pd(0.999f);
    __m512d v1 = _mm512_set1_pd(vv1);
    __m512d v2 = _mm512_set1_pd(vv2);
    for (j = 0; j < repeat; ++j) {
      MADD1_MOP2
    }
    _mm512_store_pd(&(data[gid]), s);
  }
#endif
  return;
}
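/*
 * MADD1_MOP2, DT and STEP are defined elsewhere in this benchmark and are not
 * shown in this excerpt.  The definition below is only a plausible assumption
 * for illustration (one multiply-add per iteration, with DT presumably double
 * and STEP presumably 8 on the MIC path); the real macros may differ.
 */
#ifndef MADD1_MOP2
#ifndef MIC
#define MADD1_MOP2  s = s * v1 + v2;
#else
#define MADD1_MOP2  s = _mm512_fmadd_pd(s, v1, v2);
#endif
#endif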
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_set( const double i_scalar,
                        double*      io_c,
                        const int    i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar);
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   vec_scalar );
      _mm256_store_pd(  &(io_c[l_n+4]), vec_scalar );
#else
      _mm256_stream_pd( &(io_c[l_n]),   vec_scalar );
      _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), vec_scalar );
#else
      _mm512_stream_pd( &(io_c[l_n]), vec_scalar );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_scalar;
  }
}
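/*
 * stream_init() is defined elsewhere in the LIBXSMM sample and is not shown in
 * this excerpt.  The sketch below only illustrates the kind of split the
 * callers rely on -- a scalar prologue until io_c reaches 64-byte alignment,
 * then a bulk trip count rounded down to a multiple of 8 doubles -- and is an
 * assumption for illustration (stream_init_sketch is a hypothetical name), not
 * the library's actual code.
 */
static void stream_init_sketch(int i_length, size_t i_start_address,
                               int* o_trip_prolog, int* o_trip_stream)
{
  /* doubles needed until the store target is 64-byte aligned */
  const int l_misalign = (int)(i_start_address % 64);
  *o_trip_prolog = (l_misalign == 0) ? 0 : (64 - l_misalign) / (int)sizeof(double);
  if (*o_trip_prolog > i_length) *o_trip_prolog = i_length;

  /* bulk part: a multiple of 8 doubles; the remainder goes to the epilogue */
  *o_trip_stream = *o_trip_prolog + (((i_length - *o_trip_prolog) / 8) * 8);
}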
extern void avx512f_test (void)
{
  x1 = _mm512_mask_mov_pd (x1, m, x2);
  x1 = _mm512_maskz_mov_pd (m, x2);

  x1 = _mm512_load_pd (p);
  x1 = _mm512_mask_load_pd (x1, m, p);
  x1 = _mm512_maskz_load_pd (m, p);

  _mm512_store_pd (p, x1);
  _mm512_mask_store_pd (p, m, x1);
}
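/*
 * In the test above, x1, x2, m and p are file-scope variables declared
 * elsewhere in the test file.  Below is a hedged, self-contained variant for
 * illustration; the demo name, the mask value 0xA5 and the 64-byte aligned
 * buffer are assumptions, not part of the original test.
 */
static __m512d g_x1, g_x2;
static __mmask8 g_m = 0xA5;
static double g_p[8] __attribute__((aligned(64)));

static void avx512f_mov_load_store_demo(void)
{
  __m512d a = g_x1;
  a = _mm512_mask_mov_pd (a, g_m, g_x2);   /* merge-masked register move */
  a = _mm512_maskz_mov_pd (g_m, g_x2);     /* zero-masked register move  */
  a = _mm512_load_pd (g_p);                /* 64-byte aligned load       */
  a = _mm512_mask_load_pd (a, g_m, g_p);   /* merge-masked aligned load  */
  a = _mm512_maskz_load_pd (g_m, g_p);     /* zero-masked aligned load   */
  _mm512_store_pd (g_p, a);                /* 64-byte aligned store      */
  _mm512_mask_store_pd (g_p, g_m, a);      /* masked aligned store       */
  g_x1 = a;
}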
__inline void mic_broadcast16x64(const double* inv, double* outv)
{
  __mmask8 k1 = _mm512_int2mask(0x0F);
  __mmask8 k2 = _mm512_int2mask(0xF0);

  /* each iteration fills one 8-double vector: the lower half broadcasts one
     transposed element of the 4x4 input block, the upper half the next one */
  for (int l = 0; l < 16; l += 2)
  {
    __m512d t = _mm512_setzero_pd();
    t = _mm512_mask_extload_pd(t, k1, &inv[(l % 4) * 4 + l / 4],
                               _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
    t = _mm512_mask_extload_pd(t, k2, &inv[((l + 1) % 4) * 4 + (l + 1) / 4],
                               _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
    _mm512_store_pd(&outv[l * 4], t);
  }
}
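/*
 * Scalar reference for mic_broadcast16x64(), added as a hedged illustration of
 * the data layout (mic_broadcast16x64_ref is a hypothetical name): the 16
 * inputs are read as a 4x4 block, and each element of its transpose is
 * replicated four times, giving 64 outputs.  This mirrors what the masked
 * extload broadcasts above produce.
 */
static void mic_broadcast16x64_ref(const double* inv, double* outv)
{
  for (int k = 0; k < 16; ++k) {
    const double v = inv[(k % 4) * 4 + k / 4];  /* element (k/4, k%4) of inv^T */
    for (int j = 0; j < 4; ++j)
      outv[k * 4 + j] = v;
  }
}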
void ks_rank_k_int_d16x14(
    int    k,
    double *a,
    double *b,
    double *c,
    int    ldc,
    aux_t  *aux
    )
{
  int i;
  double neg2 = -2.0;

  v8df_t c007_0, c007_1, c007_2, c007_3, c007_4;
  v8df_t c007_5, c007_6, c007_7, c007_8, c007_9;
  v8df_t c007_10, c007_11, c007_12, c007_13;
  v8df_t c815_0, c815_1, c815_2, c815_3, c815_4;
  v8df_t c815_5, c815_6, c815_7, c815_8, c815_9;
  v8df_t c815_10, c815_11, c815_12, c815_13;
  v8df_t a007, a815, b_tmp;

  int k_iter = k;

  // TODO: need to clean the c buffer.
  // Zero the C accumulators so the FMAs below start from 0 instead of garbage.
  c007_0.v  = _mm512_setzero_pd(); c815_0.v  = _mm512_setzero_pd();
  c007_1.v  = _mm512_setzero_pd(); c815_1.v  = _mm512_setzero_pd();
  c007_2.v  = _mm512_setzero_pd(); c815_2.v  = _mm512_setzero_pd();
  c007_3.v  = _mm512_setzero_pd(); c815_3.v  = _mm512_setzero_pd();
  c007_4.v  = _mm512_setzero_pd(); c815_4.v  = _mm512_setzero_pd();
  c007_5.v  = _mm512_setzero_pd(); c815_5.v  = _mm512_setzero_pd();
  c007_6.v  = _mm512_setzero_pd(); c815_6.v  = _mm512_setzero_pd();
  c007_7.v  = _mm512_setzero_pd(); c815_7.v  = _mm512_setzero_pd();
  c007_8.v  = _mm512_setzero_pd(); c815_8.v  = _mm512_setzero_pd();
  c007_9.v  = _mm512_setzero_pd(); c815_9.v  = _mm512_setzero_pd();
  c007_10.v = _mm512_setzero_pd(); c815_10.v = _mm512_setzero_pd();
  c007_11.v = _mm512_setzero_pd(); c815_11.v = _mm512_setzero_pd();
  c007_12.v = _mm512_setzero_pd(); c815_12.v = _mm512_setzero_pd();
  c007_13.v = _mm512_setzero_pd(); c815_13.v = _mm512_setzero_pd();

  for ( i = 0; i < k_iter; ++i ) {
    a007.v = _mm512_load_pd( a );
    a815.v = _mm512_load_pd( a + 8 );
    //printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n",
    //    a007.d[ 0 ], a007.d[ 1 ], a007.d[ 2 ], a007.d[ 3 ],
    //    a007.d[ 4 ], a007.d[ 5 ], a007.d[ 6 ], a007.d[ 7 ] );

    b_tmp.v  = _mm512_extload_pd( b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    //printf( "b[ 0 ] = %lf\n", b[ 0 ] );
    //printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n",
    //    b_tmp.d[ 0 ], b_tmp.d[ 1 ], b_tmp.d[ 2 ], b_tmp.d[ 3 ],
    //    b_tmp.d[ 4 ], b_tmp.d[ 5 ], b_tmp.d[ 6 ], b_tmp.d[ 7 ] );
    c007_0.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_0.v );
    c815_0.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_0.v );

    b_tmp.v  = _mm512_extload_pd( b + 1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_1.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_1.v );
    c815_1.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_1.v );

    b_tmp.v  = _mm512_extload_pd( b + 2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_2.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_2.v );
    c815_2.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_2.v );

    b_tmp.v  = _mm512_extload_pd( b + 3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_3.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_3.v );
    c815_3.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_3.v );

    b_tmp.v  = _mm512_extload_pd( b + 4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_4.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_4.v );
    c815_4.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_4.v );

    b_tmp.v  = _mm512_extload_pd( b + 5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_5.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_5.v );
    c815_5.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_5.v );

    b_tmp.v  = _mm512_extload_pd( b + 6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_6.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_6.v );
    c815_6.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_6.v );

    b_tmp.v  = _mm512_extload_pd( b + 7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_7.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_7.v );
    c815_7.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_7.v );

    b_tmp.v  = _mm512_extload_pd( b + 8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_8.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_8.v );
    c815_8.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_8.v );

    b_tmp.v  = _mm512_extload_pd( b + 9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_9.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_9.v );
    c815_9.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_9.v );

    b_tmp.v   = _mm512_extload_pd( b + 10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_10.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_10.v );
    c815_10.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_10.v );

    b_tmp.v   = _mm512_extload_pd( b + 11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_11.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_11.v );
    c815_11.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_11.v );

    b_tmp.v   = _mm512_extload_pd( b + 12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_12.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_12.v );
    c815_12.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_12.v );

    b_tmp.v   = _mm512_extload_pd( b + 13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
    c007_13.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_13.v );
    c815_13.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_13.v );

    a += 16;
    b += 16;
  }

  // simulate kernel summation
  c007_0.v = _mm512_add_pd( c007_0.v, c007_1.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_1.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_2.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_2.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_3.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_3.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_4.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_4.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_5.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_5.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_6.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_6.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_7.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_7.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_8.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_8.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_9.v );  c815_0.v = _mm512_add_pd( c815_0.v, c815_9.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_10.v ); c815_0.v = _mm512_add_pd( c815_0.v, c815_10.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_11.v ); c815_0.v = _mm512_add_pd( c815_0.v, c815_11.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_12.v ); c815_0.v = _mm512_add_pd( c815_0.v, c815_12.v );
  c007_0.v = _mm512_add_pd( c007_0.v, c007_13.v ); c815_0.v = _mm512_add_pd( c815_0.v, c815_13.v );

//  if ( aux->pc != 0 ) {
//    // packed
//    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
//    tmpc47_0.v = _mm256_load_pd( (double*)( c +  4 ) );
//    tmpc03_1.v = _mm256_load_pd( (double*)( c +  8 ) );
//    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
//    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
//    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
//    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
//    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
//
//    c03_0.v = _mm256_add_pd( tmpc03_0.v, c03_0.v );
//    c47_0.v = _mm256_add_pd( tmpc47_0.v, c47_0.v );
//    c03_1.v = _mm256_add_pd( tmpc03_1.v, c03_1.v );
//    c47_1.v = _mm256_add_pd( tmpc47_1.v, c47_1.v );
//    c03_2.v = _mm256_add_pd( tmpc03_2.v, c03_2.v );
//    c47_2.v = _mm256_add_pd( tmpc47_2.v, c47_2.v );
//    c03_3.v = _mm256_add_pd( tmpc03_3.v, c03_3.v );
//    c47_3.v = _mm256_add_pd( tmpc47_3.v, c47_3.v );
//  }

  // packed
  _mm512_store_pd( c    , c007_0.v );
  _mm512_store_pd( c + 8, c815_0.v );

//  _mm512_store_pd( c +  16, c007_1.v );  _mm512_store_pd( c +  24, c815_1.v );
//  _mm512_store_pd( c +  32, c007_2.v );  _mm512_store_pd( c +  40, c815_2.v );
//  _mm512_store_pd( c +  48, c007_3.v );  _mm512_store_pd( c +  56, c815_3.v );
//  _mm512_store_pd( c +  64, c007_4.v );  _mm512_store_pd( c +  72, c815_4.v );
//  _mm512_store_pd( c +  80, c007_5.v );  _mm512_store_pd( c +  88, c815_5.v );
//  _mm512_store_pd( c +  96, c007_6.v );  _mm512_store_pd( c + 104, c815_6.v );
//  _mm512_store_pd( c + 112, c007_7.v );  _mm512_store_pd( c + 120, c815_7.v );
//  _mm512_store_pd( c + 128, c007_8.v );  _mm512_store_pd( c + 136, c815_8.v );
//  _mm512_store_pd( c + 144, c007_9.v );  _mm512_store_pd( c + 152, c815_9.v );
//  _mm512_store_pd( c + 160, c007_10.v ); _mm512_store_pd( c + 168, c815_10.v );
//  _mm512_store_pd( c + 176, c007_11.v ); _mm512_store_pd( c + 184, c815_11.v );
//  _mm512_store_pd( c + 192, c007_12.v ); _mm512_store_pd( c + 200, c815_12.v );
//  _mm512_store_pd( c + 208, c007_13.v ); _mm512_store_pd( c + 216, c815_13.v );

  //printf( "ldc = %d\n", ldc );
  //printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n",
  //    c007_0.d[ 0 ], c007_0.d[ 1 ], c007_0.d[ 2 ], c007_0.d[ 3 ],
  //    c007_0.d[ 4 ], c007_0.d[ 5 ], c007_0.d[ 6 ], c007_0.d[ 7 ] );
  //printf( "%lf, %lf, %lf, %lf\n", c[1], c[ ldc + 1], c[ ldc * 2 + 1], c[ ldc * 3 + 1] );
  //printf( "%lf, %lf, %lf, %lf\n", c[2], c[ ldc + 2], c[ ldc * 2 + 2], c[ ldc * 3 + 2] );
  //printf( "%lf, %lf, %lf, %lf\n", c[3], c[ ldc + 3], c[ ldc * 2 + 3], c[ ldc * 3 + 3] );
  //printf( "%lf, %lf, %lf, %lf\n", c[4], c[ ldc + 4], c[ ldc * 2 + 4], c[ ldc * 3 + 4] );
  //printf( "%lf, %lf, %lf, %lf\n", c[5], c[ ldc + 5], c[ ldc * 2 + 5], c[ ldc * 3 + 5] );
  //printf( "%lf, %lf, %lf, %lf\n", c[6], c[ ldc + 6], c[ ldc * 2 + 6], c[ ldc * 3 + 6] );
  //printf( "%lf, %lf, %lf, %lf\n", c[7], c[ ldc + 7], c[ ldc * 2 + 7], c[ ldc * 3 + 7] );
}
void newviewGTRGAMMA_MIC(int tipCase,
                         double *x1, double *x2, double *x3, double *extEV, double *tipVector,
                         int *ex3, unsigned char *tipX1, unsigned char *tipX2,
                         int n, double *left, double *right, int *wgt, int *scalerIncrement,
                         const pllBoolean fastScaling)
{
  __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
  __m512d twotothe256_MIC   = _mm512_set1_pd(PLL_TWOTOTHE256);
  __m512i absMask_MIC       = _mm512_set1_epi64(0x7fffffffffffffffULL);

  int addScale = 0;

  /* states, span and maxStateValue are defined by the enclosing file; for the
     DNA/GAMMA case handled here (4 states x 4 rate categories) they are
     states = 4, span = 16, maxStateValue = 16. */

  double aEV[64] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));

  #pragma ivdep
  for (int l = 0; l < 64; ++l)
  {
    aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
  }

  switch (tipCase)
  {
    case PLL_TIP_TIP:
      {
        /* multiply all possible tip state vectors with the respective P-matrices */
        double umpX1[256] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
        double umpX2[256] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));

        for (int k = 0; k < 256; ++k)
        {
          umpX1[k] = 0.0;
          umpX2[k] = 0.0;
        }

        for (int i = 0; i < maxStateValue; ++i)
        {
          for (int l = 0; l < states; ++l)
          {
            #pragma ivdep
            for (int k = 0; k < span; ++k)
            {
              umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
              umpX2[16 * i + k] += tipVector[i * 4 + l] * right[k * 4 + l];
            }
          }
        }

        double auX[64] __attribute__((aligned(64)));

        for (int i = 0; i < n; ++i)
        {
          _mm_prefetch((const char*) &x3[span*(i+8)],     _MM_HINT_ET1);
          _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);
          _mm_prefetch((const char*) &x3[span*(i+1)],     _MM_HINT_ET0);
          _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

          const double *uX1 = &umpX1[16 * tipX1[i]];
          const double *uX2 = &umpX2[16 * tipX2[i]];

          double uX[16] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
          double* v = &x3[i * 16];

          #pragma ivdep
          #pragma vector aligned
          for (int l = 0; l < 16; ++l)
          {
            uX[l] = uX1[l] * uX2[l];
            v[l]  = 0.;
          }

          mic_broadcast16x64(uX, auX);

          for (int j = 0; j < 4; ++j)
          {
            #pragma ivdep
            #pragma vector aligned
            #pragma vector nontemporal
            for (int k = 0; k < 16; ++k)
            {
              v[k] += auX[j*16 + k] * aEV[j*16 + k];
            }
          }

          // init scaling counter for the site
          if (!fastScaling)
            ex3[i] = 0;
        } // sites loop
      }
      break;
    case PLL_TIP_INNER:
      {
        /* we do analogous pre-computations as above, with the only difference
           that we now do them only for one tip vector */
        double umpX1[256] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));

        /* precompute P and left tip vector product */
        for (int k = 0; k < 256; ++k)
        {
          umpX1[k] = 0.0;
        }

        for (int i = 0; i < 16; ++i)
        {
          for (int l = 0; l < 4; ++l)
          {
            #pragma ivdep
            for (int k = 0; k < 16; ++k)
            {
              umpX1[16 * i + k] += tipVector[i * 4 + l] * left[k * 4 + l];
            }
          }
        }

        // re-arrange right matrix for better memory layout
        double aRight[64] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
        for (int j = 0; j < 4; j++)
        {
          for (int l = 0; l < 16; l++)
          {
            aRight[j*16 + l] = right[l*4 + j];
          }
        }

        for (int i = 0; i < n; i++)
        {
          _mm_prefetch((const char*) &x2[span*(i+16)],     _MM_HINT_T1);
          _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
          _mm_prefetch((const char*) &x3[span*(i+16)],     _MM_HINT_ET1);
          _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);

          _mm_prefetch((const char*) &x2[span*(i+1)],      _MM_HINT_T0);
          _mm_prefetch((const char*) &x2[span*(i+1) + 8],  _MM_HINT_T0);
          _mm_prefetch((const char*) &x3[span*(i+1)],      _MM_HINT_ET0);
          _mm_prefetch((const char*) &x3[span*(i+1) + 8],  _MM_HINT_ET0);

          /* access pre-computed value based on the raw sequence data tipX1
             that is used as an index */
          double* uX1 = &umpX1[span * tipX1[i]];

          double uX2[16] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
          double uX[16]  __attribute__((aligned(PLL_BYTE_ALIGNMENT)));

          #pragma vector aligned
          for (int l = 0; l < 16; ++l)
          {
            uX2[l] = 0.;
          }

          double aV2[64] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
          const double* v2 = &(x2[16 * i]);

          mic_broadcast16x64(v2, aV2);

          for (int j = 0; j < 4; j++)
          {
            #pragma ivdep
            #pragma vector aligned
            for (int l = 0; l < 16; l++)
            {
              uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
            }
          }

          double* v3 = &(x3[span * i]);

          #pragma ivdep
          #pragma vector aligned
          for (int l = 0; l < 16; ++l)
          {
            uX[l] = uX1[l] * uX2[l];
            v3[l] = 0.;
          }

          double auX[64] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
          mic_broadcast16x64(uX, auX);

          for (int j = 0; j < 4; ++j)
          {
            #pragma ivdep
            #pragma vector aligned
            for (int k = 0; k < 16; ++k)
            {
              v3[k] += auX[j*16 + k] * aEV[j*16 + k];
            }
          }

          __m512d t1 = _mm512_load_pd(&v3[0]);
          t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
          double vmax1 = _mm512_reduce_gmax_pd(t1);

          __m512d t2 = _mm512_load_pd(&v3[8]);
          t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
          double vmax2 = _mm512_reduce_gmax_pd(t2);

          if (vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
          {
            t1 = _mm512_mul_pd(t1, twotothe256_MIC);
            _mm512_store_pd(&v3[0], t1);
            t2 = _mm512_mul_pd(t2, twotothe256_MIC);
            _mm512_store_pd(&v3[8], t2);

            if (!fastScaling)
              ex3[i] += 1;
            else
              addScale += wgt[i];
          }
        } // site loop
      }
      break;
    case PLL_INNER_INNER:
      {
        /* same as above, without pre-computations */

        // re-arrange left and right matrices for better memory layout
        double aLeft[64]  __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
        double aRight[64] __attribute__((aligned(PLL_BYTE_ALIGNMENT)));
        for (int j = 0; j < 4; j++)
        {
          for (int l = 0; l < 16; l++)
          {
            aLeft[j*16 + l]  = left[l*4 + j];
            aRight[j*16 + l] = right[l*4 + j];
          }
        }

        for (int i = 0; i < n; i++)
        {
          _mm_prefetch((const char*) &x1[span*(i+8)],     _MM_HINT_T1);
          _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
          _mm_prefetch((const char*) &x2[span*(i+8)],     _MM_HINT_T1);
          _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
          _mm_prefetch((const char*) &x3[span*(i+8)],     _MM_HINT_ET1);
          _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);

          _mm_prefetch((const char*) &x1[span*(i+1)],     _MM_HINT_T0);
          _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
          _mm_prefetch((const char*) &x2[span*(i+1)],     _MM_HINT_T0);
          _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
          _mm_prefetch((const char*) &x3[span*(i+1)],     _MM_HINT_ET0);
          _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

          double uX1[16] __attribute__((aligned(64)));
          double uX2[16] __attribute__((aligned(64)));
          double uX[16]  __attribute__((aligned(64)));

          for (int l = 0; l < 16; l++)
          {
            uX1[l] = 0.;
            uX2[l] = 0.;
          }

          double aV1[64] __attribute__((aligned(64)));
          double aV2[64] __attribute__((aligned(64)));

          const double* v1 = &(x1[span * i]);
          const double* v2 = &(x2[span * i]);

          mic_broadcast16x64(v1, aV1);
          mic_broadcast16x64(v2, aV2);

          for (int j = 0; j < 4; j++)
          {
            #pragma ivdep
            #pragma vector aligned
            for (int l = 0; l < 16; l++)
            {
              uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
              uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
            }
          }

          double* v3 = &(x3[span * i]);

          #pragma ivdep
          #pragma vector aligned
          for (int l = 0; l < 16; ++l)
          {
            uX[l] = uX1[l] * uX2[l];
            v3[l] = 0.;
          }

          double auX[64] __attribute__((aligned(64)));
          mic_broadcast16x64(uX, auX);

          for (int j = 0; j < 4; ++j)
          {
            #pragma ivdep
            #pragma vector aligned
            for (int k = 0; k < 16; ++k)
            {
              v3[k] += auX[j*16 + k] * aEV[j*16 + k];
            }
          }

          __m512d t1 = _mm512_load_pd(&v3[0]);
          t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
          double vmax1 = _mm512_reduce_gmax_pd(t1);

          __m512d t2 = _mm512_load_pd(&v3[8]);
          t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
          double vmax2 = _mm512_reduce_gmax_pd(t2);

          if (vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
          {
            t1 = _mm512_mul_pd(t1, twotothe256_MIC);
            _mm512_store_pd(&v3[0], t1);
            t2 = _mm512_mul_pd(t2, twotothe256_MIC);
            _mm512_store_pd(&v3[8], t2);

            if (!fastScaling)
              ex3[i] += 1;
            else
              addScale += wgt[i];
          }
        }
      }
      break;
    default:
//      assert(0);
      break;
  }

  /* as above, increment the global counter that counts scaling multiplications
     by the scaling multiplications carried out for computing the likelihood
     array at node p */
  if (fastScaling)
  {
    *scalerIncrement = addScale;
  }
}
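/*
 * Aside (not part of PLL): the kernel above relies on the KNC-only
 * _mm512_reduce_gmax_pd() and extload broadcasts.  The hypothetical helper
 * below sketches how the per-site scaling test could look under AVX-512F; it
 * assumes _mm512_abs_pd() and the _mm512_reduce_max_pd() helper sequence are
 * available in the compiler in use, and it rescales the original signed
 * values rather than the sign-stripped ones.
 */
static inline int rescale_site_if_small(double *v3,          /* 16 site entries   */
                                        double min_lik,      /* PLL_MINLIKELIHOOD */
                                        double two_to_256)   /* PLL_TWOTOTHE256   */
{
  const __m512d t1 = _mm512_load_pd(&v3[0]);
  const __m512d t2 = _mm512_load_pd(&v3[8]);
  const double vmax1 = _mm512_reduce_max_pd(_mm512_abs_pd(t1));
  const double vmax2 = _mm512_reduce_max_pd(_mm512_abs_pd(t2));

  if (vmax1 < min_lik && vmax2 < min_lik)
  {
    const __m512d scale = _mm512_set1_pd(two_to_256);
    _mm512_store_pd(&v3[0], _mm512_mul_pd(t1, scale));
    _mm512_store_pd(&v3[8], _mm512_mul_pd(t2, scale));
    return 1;   /* caller bumps ex3[i] or addScale */
  }
  return 0;
}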
inline void store(double *r, const F64vec8 v) { _mm512_store_pd(r, v); }
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_update_helmholtz_no_h2( const double* i_g1,
                                    const double* i_g2,
                                    const double* i_g3,
                                    const double* i_tm1,
                                    const double* i_tm2,
                                    const double* i_tm3,
                                    double*       io_c,
                                    const double  i_h1,
                                    const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1  = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2  = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));

      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));

      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1  = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2  = _mm256_mul_pd(vec_g2_2, vec_tm2_2);

      vec_g3_1  = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2  = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));

      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1  = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1  = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2  = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2  = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3  = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3  = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1  = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1  = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_compscale( const double* i_a,
                              const double* i_b,
                              double*       io_c,
                              const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;

  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates too many dependencies */
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m256d vec_a_1, vec_b_1;
      __m256d vec_a_2, vec_b_2;
      vec_a_1 = _mm256_loadu_pd(&(i_a[l_n]));
      vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4]));
      vec_b_1 = _mm256_loadu_pd(&(i_b[l_n]));
      vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4]));
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream; l_n+=8 ) {
      __m512d vec_a, vec_b;
      vec_a = _mm512_loadu_pd(&(i_a[l_n]));
      vec_b = _mm512_loadu_pd(&(i_b[l_n]));
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length; l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
}
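/*
 * Hedged usage sketch for the stream kernels above (stream_demo is a
 * hypothetical name, not part of the LIBXSMM sample): allocate 64-byte
 * aligned buffers via POSIX posix_memalign() so that the bulk loop can take
 * the aligned/streaming-store path, then chain the kernels.
 */
#include <stdlib.h>

static void stream_demo(int n)
{
  double *a = NULL, *b = NULL, *c = NULL;
  /* posix_memalign provides the 64-byte alignment the vector stores expect */
  if (posix_memalign((void**)&a, 64, sizeof(double) * n) ||
      posix_memalign((void**)&b, 64, sizeof(double) * n) ||
      posix_memalign((void**)&c, 64, sizeof(double) * n)) {
    free(a); free(b); free(c);
    return;
  }
  stream_vector_set(1.0, a, n);        /* a[i] = 1.0         */
  stream_vector_set(2.0, b, n);        /* b[i] = 2.0         */
  stream_vector_compscale(a, b, c, n); /* c[i] = a[i] * b[i] */
  free(a); free(b); free(c);
}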
inline void store_aligned(double *data) const
{
  SHORTVEC_ASSERT_ALIGNED(data, 64);
  _mm512_store_pd(data, val);
}