Example 1
#include <stdio.h>
#include <immintrin.h>

int main()
{
   __m512d  t0,t1;

   double d1[8] __attribute__ ((aligned(64)));   
   double d2[8] __attribute__ ((aligned(64)));   
   double d3[8] __attribute__ ((aligned(64)));   

   for(int i=0; i<8; i++)
   {
       d1[i]= i*1.0;

       d2[i]= 0.0;

       d3[i] = d1[i];

   }

   //printf("testing intialization of registers\n");
   //_mm512_store_pd(d1,t0);
   //printf("d1=t0: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   //_mm512_store_pd(d1,t1);
   //printf("d1=t1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);


   t0 = _mm512_load_pd(d1);

   printf("permute backward\n");
   // rotate the four 128-bit lanes down by one: d2 = {2,3,4,5,6,7,0,1}
   t1 = (__m512d)  _mm512_permute4f128_epi32 ( (__m512i) t0, 0b00111001);
   _mm512_store_pd(d2,t1);
   printf("d1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   printf("d2: %f %f %f %f %f %f %f %f  \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]);
    
   printf("permute forward\n");
   // rotate the four 128-bit lanes up by one: d2 = {6,7,0,1,2,3,4,5}
   t1 = (__m512d)  _mm512_permute4f128_epi32 ( (__m512i) t0, 0b10010011);
   _mm512_store_pd(d2,t1);
   printf("d1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   printf("d2: %f %f %f %f %f %f %f %f  \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]);
   


   // 32-bit permutation indices: elements 2,3 take elements 0,1, so double lane 1 becomes a copy of double lane 0
   int __attribute__((aligned(64))) order[16]={0,1,0,1,4,5,6,7,8,9,10,11,12,13,14,15};

   __m512i morder = _mm512_load_epi32(order);


   printf("permuting doubles\n");
   t1 = (__m512d) _mm512_permutevar_epi32 (morder, (__m512i) t0);
   _mm512_store_pd(d2,t1);
   printf("d1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   printf("d2: %f %f %f %f %f %f %f %f  \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]);

 
   return 0;

}
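A note on the store used throughout these examples: `_mm512_store_pd` requires a 64-byte aligned destination (hence the `aligned(64)` attributes above), while on AVX-512F `_mm512_storeu_pd` accepts any address. A minimal, self-contained sketch of the two variants (buffer names are ours):

#include <stdlib.h>
#include <immintrin.h>

int store_demo(void)
{
    /* 64-byte aligned heap buffer for the aligned store */
    double *aligned_buf = aligned_alloc(64, 8 * sizeof(double));
    double plain_buf[8];                      /* no particular alignment */
    if (!aligned_buf) return -1;

    __m512d v = _mm512_set1_pd(1.5);          /* broadcast 1.5 into all 8 lanes */
    _mm512_store_pd(aligned_buf, v);          /* destination must be 64-byte aligned */
    _mm512_storeu_pd(plain_buf, v);           /* unaligned store, any address */

    free(aligned_buf);
    return 0;
}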
Example 2
void mad12(int num, DT* data, int repeat, DT vv1, DT vv2)
{
	int gid, j;
#ifndef MIC
    #pragma omp parallel for private(j)  /* j is declared outside the loop and must not be shared */
    for (gid = 0; gid<num; gid++)
    {
        register DT s = (DT)(0.999f);
	register DT v1 = vv1;
	register DT v2 = vv2;
        for (j=0; j<repeat; ++j) 
        {
		MADD1_MOP2
        }
        data[gid] = s;
    }
#else
    #pragma omp parallel for private(j)  /* j is declared outside the loop and must not be shared */
    for (gid = 0; gid<num; gid=gid+STEP)
    {       
	__m512d s = _mm512_set1_pd(0.999f);
	__m512d v1 = _mm512_set1_pd(vv1);
	__m512d v2 = _mm512_set1_pd(vv2);
        for (j=0; j<repeat; ++j) 
        {
            MADD1_MOP2
        }

	_mm512_store_pd(&(data[gid]), s);
    }
#endif
	
	return ;
}
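`DT`, `STEP` and the `MADD1_MOP2` macro are defined elsewhere in this benchmark and are not shown here. Purely as an illustration of the kind of operation the vector branch performs, one fused multiply-add step on 8 doubles could be written as follows (the helper name `madd_step` is ours, not the benchmark's):

#include <immintrin.h>

/* Illustrative only: one multiply-add step on a vector accumulator,
   s = s * v1 + v2, using a fused multiply-add on 8 doubles. */
static inline __m512d madd_step(__m512d s, __m512d v1, __m512d v2)
{
    return _mm512_fmadd_pd(s, v1, v2);
}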
Example 3
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_set( const double i_scalar,
                        double*       io_c,
                        const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar);
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   vec_scalar );
      _mm256_store_pd(  &(io_c[l_n+4]), vec_scalar );
#else
      _mm256_stream_pd( &(io_c[l_n]),   vec_scalar );
      _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), vec_scalar );
#else
      _mm512_stream_pd( &(io_c[l_n]), vec_scalar );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
}
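`stream_init()` belongs to the same benchmark and is not shown on this page; judging from how its outputs are used, it splits the iteration space into a scalar prologue (up to the first 64-byte boundary of `io_c`), a vectorizable bulk whose length is a multiple of 8 doubles, and a scalar epilogue. A rough sketch of such a split, under exactly those assumptions (the helper name `split_trips` is ours, not the library's):

#include <stddef.h>

/* Hypothetical split: scalar iterations until addr is 64-byte aligned,
   then the largest multiple of 8 doubles that still fits in length. */
static void split_trips(int length, size_t addr,
                        int* trip_prolog, int* trip_stream)
{
    int prolog = (int)(((64 - (addr & 63)) & 63) / sizeof(double));
    if (prolog > length) prolog = length;
    *trip_prolog = prolog;
    *trip_stream = prolog + ((length - prolog) / 8) * 8;
}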
Example 4
void extern
avx512f_test (void)
{
  x1 = _mm512_mask_mov_pd (x1, m, x2);
  x1 = _mm512_maskz_mov_pd (m, x2);

  x1 = _mm512_load_pd (p);
  x1 = _mm512_mask_load_pd (x1, m, p);
  x1 = _mm512_maskz_load_pd (m, p);

  _mm512_store_pd (p, x1);
  _mm512_mask_store_pd (p, m, x1);
}
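This fragment is a compile test; `x1`, `x2`, `m` and `p` are declared elsewhere in the test file. A self-contained sketch of the masked move/store pattern with plausible declarations (the types below are inferred from the calls, not taken from the original):

#include <immintrin.h>

/* Plausible declarations for the globals used above (inferred, not original). */
static __m512d  x1, x2;                             /* 8-double vectors      */
static __mmask8 m = 0xAA;                           /* write-mask, 8 lanes   */
static double   p[8] __attribute__((aligned(64)));  /* 64-byte aligned data  */

int main(void)
{
    x2 = _mm512_set1_pd(3.0);
    x1 = _mm512_maskz_mov_pd(m, x2);   /* masked lanes copy x2, the rest become zero */
    _mm512_mask_store_pd(p, m, x1);    /* store only the masked lanes to p           */
    return 0;
}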
Example 5
__inline void mic_broadcast16x64(const double* inv, double* outv)
{
    __mmask8 k1 = _mm512_int2mask(0x0F);
    __mmask8 k2 = _mm512_int2mask(0xF0);
    for(int l = 0; l < 16; l += 2)
    {
        __m512d t = _mm512_setzero_pd();
        t = _mm512_mask_extload_pd(t, k1, &inv[(l%4)*4 + l/4], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);
        t = _mm512_mask_extload_pd(t, k2, &inv[((l+1)%4)*4 + (l+1)/4], _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, _MM_HINT_NONE);

        _mm512_store_pd(&outv[l*4], t);
    }
}
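`_mm512_int2mask` and `_mm512_mask_extload_pd` are first-generation Xeon Phi (KNC) intrinsics. The same mask-and-broadcast pattern can be expressed with plain AVX-512F intrinsics; a sketch under that assumption (the function name `broadcast_pair_store` is ours):

#include <immintrin.h>

/* AVX-512F sketch: broadcast *lo into lanes 0-3 and *hi into lanes 4-7,
   then store the 8 doubles to a 64-byte aligned destination. */
static inline void broadcast_pair_store(const double* lo, const double* hi,
                                        double* out)
{
    __m512d t = _mm512_setzero_pd();
    t = _mm512_mask_broadcastsd_pd(t, (__mmask8)0x0F, _mm_load_sd(lo));
    t = _mm512_mask_broadcastsd_pd(t, (__mmask8)0xF0, _mm_load_sd(hi));
    _mm512_store_pd(out, t);
}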
Example 6
void ks_rank_k_int_d16x14(
    int    k,
    double *a,
    double *b,
    double *c,
    int    ldc,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;


  v8df_t c007_0,  c007_1,  c007_2,  c007_3,  c007_4;
  v8df_t c007_5,  c007_6,  c007_7,  c007_8,  c007_9;
  v8df_t c007_10, c007_11, c007_12, c007_13;

  v8df_t c815_0,  c815_1,  c815_2,  c815_3,  c815_4;
  v8df_t c815_5,  c815_6,  c815_7,  c815_8,  c815_9;
  v8df_t c815_10, c815_11, c815_12, c815_13;


  v8df_t a007, a815, b_tmp;

  int k_iter = k;


  // TODO: need to clean the c buffer.




  for ( i = 0; i < k_iter; ++ i ) { 
	a007.v    = _mm512_load_pd( a );
	a815.v    = _mm512_load_pd( a + 8 );

	//printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n", 
	//	a007.d[ 0 ], a007.d[ 1 ], a007.d[ 2 ],  a007.d[ 3 ], 
	//	a007.d[ 4 ], a007.d[ 5 ], a007.d[ 6 ],  a007.d[ 7 ] );

	b_tmp.v   = _mm512_extload_pd( b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );

    //printf( "b[ 0 ] = %lf\n", b[ 0 ] );

	//printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n", 
	//	b_tmp.d[ 0 ], b_tmp.d[ 1 ], b_tmp.d[ 2 ],  b_tmp.d[ 3 ], 
	//	b_tmp.d[ 4 ], b_tmp.d[ 5 ], b_tmp.d[ 6 ],  b_tmp.d[ 7 ] );



	c007_0.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_0.v ); 
	c815_0.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_0.v ); 
	
	b_tmp.v   = _mm512_extload_pd( b + 1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_1.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_1.v ); 
	c815_1.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_1.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_2.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_2.v ); 
	c815_2.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_2.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_3.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_3.v ); 
	c815_3.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_3.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_4.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_4.v ); 
	c815_4.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_4.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_5.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_5.v ); 
	c815_5.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_5.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_6.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_6.v ); 
	c815_6.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_6.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_7.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_7.v ); 
	c815_7.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_7.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_8.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_8.v ); 
	c815_8.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_8.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_9.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_9.v ); 
	c815_9.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_9.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_10.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_10.v ); 
	c815_10.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_10.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_11.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_11.v ); 
	c815_11.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_11.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_12.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_12.v ); 
	c815_12.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_12.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_13.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_13.v ); 
	c815_13.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_13.v );

	a += 16;
	b += 16;
  }
 

  // simulate kernel summation
  c007_0.v  = _mm512_add_pd( c007_0.v, c007_1.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_1.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_2.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_2.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_3.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_3.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_4.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_4.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_5.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_5.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_6.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_6.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_7.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_7.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_8.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_8.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_9.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_9.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_10.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_10.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_11.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_11.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_12.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_12.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_13.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_13.v ); 










//  if ( aux->pc != 0 ) {
//
//    // packed
//    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
//    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
//
//    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
//    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
//
//    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
//    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
//
//    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
//    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
//    
//
//    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
//    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
//
//    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
//    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
//
//    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
//    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
//
//    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
//    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
//  }
//
//
  // packed
  _mm512_store_pd( c     , c007_0.v );
  _mm512_store_pd( c + 8 , c815_0.v );

//  _mm512_store_pd( c + 16, c007_1.v );
//  _mm512_store_pd( c + 24, c815_1.v );
//
//  _mm512_store_pd( c + 32, c007_2.v );
//  _mm512_store_pd( c + 40, c815_2.v );
//
//  _mm512_store_pd( c + 48, c007_3.v );
//  _mm512_store_pd( c + 56, c815_3.v );
//
//  _mm512_store_pd( c + 64, c007_4.v );
//  _mm512_store_pd( c + 72, c815_4.v );
//
//  _mm512_store_pd( c + 80, c007_5.v );
//  _mm512_store_pd( c + 88, c815_5.v );
//
//  _mm512_store_pd( c + 96, c007_6.v );
//  _mm512_store_pd( c + 104, c815_6.v );
//
//  _mm512_store_pd( c + 112, c007_7.v );
//  _mm512_store_pd( c + 120, c815_7.v );
//
//  _mm512_store_pd( c + 128, c007_8.v );
//  _mm512_store_pd( c + 136, c815_8.v );
//
//  _mm512_store_pd( c + 144, c007_9.v );
//  _mm512_store_pd( c + 152, c815_9.v );
//
//  _mm512_store_pd( c + 160, c007_10.v );
//  _mm512_store_pd( c + 168, c815_10.v );
//
//  _mm512_store_pd( c + 176, c007_11.v );
//  _mm512_store_pd( c + 184, c815_11.v );
//
//  _mm512_store_pd( c + 192, c007_12.v );
//  _mm512_store_pd( c + 200, c815_12.v );
//
//  _mm512_store_pd( c + 208, c007_13.v );
//  _mm512_store_pd( c + 216, c815_13.v );



  //printf( "ldc = %d\n", ldc );
  

//  printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n", 
//	  c007_0.d[ 0 ], c007_0.d[ 1 ], c007_0.d[ 2 ],  c007_0.d[ 3 ], 
//	  c007_0.d[ 4 ], c007_0.d[ 5 ], c007_0.d[ 6 ],  c007_0.d[ 7 ] );
//

  //printf( "%lf, %lf, %lf, %lf\n", c[1], c[ ldc + 1], c[ ldc * 2 + 1], c[ ldc * 3 + 1] );
  //printf( "%lf, %lf, %lf, %lf\n", c[2], c[ ldc + 2], c[ ldc * 2 + 2], c[ ldc * 3 + 2] );
  //printf( "%lf, %lf, %lf, %lf\n", c[3], c[ ldc + 3], c[ ldc * 2 + 3], c[ ldc * 3 + 3] );
  //printf( "%lf, %lf, %lf, %lf\n", c[4], c[ ldc + 4], c[ ldc * 2 + 4], c[ ldc * 3 + 4] );
  //printf( "%lf, %lf, %lf, %lf\n", c[5], c[ ldc + 5], c[ ldc * 2 + 5], c[ ldc * 3 + 5] );
  //printf( "%lf, %lf, %lf, %lf\n", c[6], c[ ldc + 6], c[ ldc * 2 + 6], c[ ldc * 3 + 6] );
  //printf( "%lf, %lf, %lf, %lf\n", c[7], c[ ldc + 7], c[ ldc * 2 + 7], c[ ldc * 3 + 7] );
}
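The `c007_*` and `c815_*` accumulators above are read by the first `_mm512_fmadd_pd` of the loop before they have ever been written, which is what the `TODO: need to clean the c buffer` refers to. Assuming the kernel is meant to accumulate from a zero C block, the missing initialization before the `k` loop would simply be:

/* Zero all 28 accumulators before entering the k loop. */
c007_0.v  = _mm512_setzero_pd();  c815_0.v  = _mm512_setzero_pd();
c007_1.v  = _mm512_setzero_pd();  c815_1.v  = _mm512_setzero_pd();
c007_2.v  = _mm512_setzero_pd();  c815_2.v  = _mm512_setzero_pd();
c007_3.v  = _mm512_setzero_pd();  c815_3.v  = _mm512_setzero_pd();
c007_4.v  = _mm512_setzero_pd();  c815_4.v  = _mm512_setzero_pd();
c007_5.v  = _mm512_setzero_pd();  c815_5.v  = _mm512_setzero_pd();
c007_6.v  = _mm512_setzero_pd();  c815_6.v  = _mm512_setzero_pd();
c007_7.v  = _mm512_setzero_pd();  c815_7.v  = _mm512_setzero_pd();
c007_8.v  = _mm512_setzero_pd();  c815_8.v  = _mm512_setzero_pd();
c007_9.v  = _mm512_setzero_pd();  c815_9.v  = _mm512_setzero_pd();
c007_10.v = _mm512_setzero_pd();  c815_10.v = _mm512_setzero_pd();
c007_11.v = _mm512_setzero_pd();  c815_11.v = _mm512_setzero_pd();
c007_12.v = _mm512_setzero_pd();  c815_12.v = _mm512_setzero_pd();
c007_13.v = _mm512_setzero_pd();  c815_13.v = _mm512_setzero_pd();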
Example 7
void newviewGTRGAMMA_MIC(int tipCase,
                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
{
    __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
    __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
    __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);

	int addScale = 0;

    double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));

    #pragma ivdep
    for (int l = 0; l < 64; ++l)
    {
        aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
    }

  switch(tipCase)
  {
    case PLL_TIP_TIP:
      {
        /* multiply all possible tip state vectors with the respective P-matrices
        */

            double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

            for(int k = 0; k < 256; ++k)
            {
                umpX1[k] = 0.0;
                umpX2[k] = 0.0;
            }

            for(int i = 0; i < maxStateValue; ++i)
            {
              for(int l = 0; l < states; ++l)
              {
                  #pragma ivdep
                  for(int k = 0; k < span; ++k)
                  {
                      umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
                      umpX2[16 * i + k] +=  tipVector[i * 4 + l] * right[k * 4 + l];
                  }
              }
            }

        double auX[64] __attribute__((align(64)));

        for(int i = 0; i < n; ++i)
        {
            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);

            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            const double *uX1 = &umpX1[16 * tipX1[i]];
            const double *uX2 = &umpX2[16 * tipX2[i]];

            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double* v = &x3[i * 16];

            #pragma ivdep
            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v[l] = 0.;
            }

            mic_broadcast16x64(uX, auX);

            for (int j = 0; j < 4; ++j)
            {
                #pragma ivdep
                #pragma vector aligned
                #pragma vector nontemporal
                for(int k = 0; k < 16; ++k)
                {
                    v[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            // init scaling counter for the site
            if (!fastScaling)
                ex3[i] = 0;

        } // sites loop

      }
      break;
    case PLL_TIP_INNER:
      {
        /* we do analogous pre-computations as above, with the only difference that we now do them
        only for one tip vector */

          double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

        /* precompute P and left tip vector product */

        for(int k = 0; k < 256; ++k)
        {
            umpX1[k] = 0.0;
        }

        for(int i = 0; i < 16; ++i)
        {
          for(int l = 0; l < 4; ++l)
          {
              #pragma ivdep
              for(int k = 0; k < 16; ++k)
              {
                  umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
              }
          }
        }

        // re-arrange right matrix for better memory layout
        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int j = 0; j < 4; j++)
        {
            for(int l = 0; l < 16; l++)
            {
                aRight[j*16 + l] = right[l*4 + j];
            }
        }

        for (int i = 0; i < n; i++)
        {
            _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);

            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
            double* uX1 = &umpX1[span * tipX1[i]];
            double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));

            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX2[l] = 0.;
            }

            double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            const double* v2 = &(x2[16 * i]);

            mic_broadcast16x64(v2, aV2);

            for(int j = 0; j < 4; j++)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int l = 0; l < 16; l++)
                {
                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
                }
            }

            double* v3 = &(x3[span * i]);

            #pragma ivdep
            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v3[l] = 0.;
            }

            double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            mic_broadcast16x64(uX, auX);

            for (int j = 0; j < 4; ++j)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int k = 0; k < 16; ++k)
                {
                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            __m512d t1 = _mm512_load_pd(&v3[0]);
            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
            double vmax1 = _mm512_reduce_gmax_pd(t1);
            __m512d t2 = _mm512_load_pd(&v3[8]);
            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
            double vmax2 = _mm512_reduce_gmax_pd(t2);

            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
            {
				t1 = _mm512_mul_pd(t1, twotothe256_MIC);
				_mm512_store_pd(&v3[0], t1);
				t2 = _mm512_mul_pd(t2, twotothe256_MIC);
				_mm512_store_pd(&v3[8], t2);

                if(!fastScaling)
                  ex3[i] += 1;
                else
                  addScale += wgt[i];
            }
        } // site loop
      }
      break;
    case PLL_INNER_INNER:
    {
      /* same as above, without pre-computations */

        // re-arrange left and right matrices for better memory layout
        double aLeft[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int j = 0; j < 4; j++)
        {
            for(int l = 0; l < 16; l++)
            {
                aLeft[j*16 + l] = left[l*4 + j];
                aRight[j*16 + l] = right[l*4 + j];
            }
        }

        for (int i = 0; i < n; i++)
        {
            _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);

            _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            double uX1[16] __attribute__((align(64)));
            double uX2[16] __attribute__((align(64)));
            double uX[16] __attribute__((align(64)));

            for(int l = 0; l < 16; l++)
            {
              uX1[l] = 0.;
              uX2[l] = 0.;
            }

            double aV1[64] __attribute__((align(64)));
            double aV2[64] __attribute__((align(64)));

            const double* v1 = &(x1[span * i]);
            const double* v2 = &(x2[span * i]);

            mic_broadcast16x64(v1, aV1);

            mic_broadcast16x64(v2, aV2);

            for(int j = 0; j < 4; j++)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int l = 0; l < 16; l++)
                {
                    uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
                }
            }

            double* v3 =  &(x3[span * i]);

            #pragma ivdep
            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v3[l] = 0.;
            }

            double auX[64] __attribute__((align(64)));
            mic_broadcast16x64(uX, auX);

            for(int j = 0; j < 4; ++j)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int k = 0; k < 16; ++k)
                {
                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }


            __m512d t1 = _mm512_load_pd(&v3[0]);
            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
            double vmax1 = _mm512_reduce_gmax_pd(t1);
            __m512d t2 = _mm512_load_pd(&v3[8]);
            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
            double vmax2 = _mm512_reduce_gmax_pd(t2);

            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
            {
				t1 = _mm512_mul_pd(t1, twotothe256_MIC);
				_mm512_store_pd(&v3[0], t1);
				t2 = _mm512_mul_pd(t2, twotothe256_MIC);
				_mm512_store_pd(&v3[8], t2);

                if(!fastScaling)
                  ex3[i] += 1;
                else
                  addScale += wgt[i];
            }
        }
    } break;
    default:
//      assert(0);
      break;
  }

  /* as above, increment the global counter that counts scaling multiplications by the scaling multiplications
     carried out for computing the likelihood array at node p */

  if (fastScaling)
  {
      *scalerIncrement = addScale;
  }

}
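The scaling check above takes the absolute value of each likelihood entry by masking off the IEEE-754 sign bit with `absMask_MIC` before comparing against `PLL_MINLIKELIHOOD`. A standalone illustration of that bit trick (the function name `abs8` is ours):

#include <immintrin.h>

/* |x| for 8 doubles: AND away the sign bit of each 64-bit lane. */
static inline __m512d abs8(__m512d x)
{
    const __m512i mask = _mm512_set1_epi64(0x7fffffffffffffffULL);
    return _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(x), mask));
}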
Example 8
inline void store(double *r, const F64vec8 v)
{
    _mm512_store_pd(r, v);
}
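`F64vec8` here is a C++ vector class that wraps a `__m512d` and converts to it implicitly; the class definition is not shown on this page. A minimal C stand-in for the same wrapper pattern (the type name `Vec8d` and the function `store_vec` are ours):

#include <immintrin.h>

/* Minimal stand-in for an 8-double wrapper type. */
typedef struct { __m512d val; } Vec8d;

static inline void store_vec(double* r, const Vec8d v)
{
    _mm512_store_pd(r, v.val);   /* r must be 64-byte aligned */
}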
Example 9
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_update_helmholtz_no_h2( const double* i_g1,
                                    const double* i_g2,
                                    const double* i_g3, 
                                    const double* i_tm1,
                                    const double* i_tm2,
                                    const double* i_tm3,
                                    double*       io_c,
                                    const double  i_h1,
                                    const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );
  
  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));

      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));

      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2);

      vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));

      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);

      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1 = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2 = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3 = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}
Example 10
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_compscale( const double* i_a,
                              const double* i_b,
                              double*       io_c,
                              const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m256d vec_a_1, vec_b_1;
      __m256d vec_a_2, vec_b_2;

      vec_a_1 = _mm256_loadu_pd(&(i_a[l_n]));
      vec_a_2 = _mm256_loadu_pd(&(i_a[l_n+4]));
      vec_b_1 = _mm256_loadu_pd(&(i_b[l_n]));
      vec_b_2 = _mm256_loadu_pd(&(i_b[l_n+4]));

#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#else
      _mm256_stream_pd( &(io_c[l_n]),   _mm256_mul_pd( vec_a_1, vec_b_1 ) );
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd( vec_a_2, vec_b_2 ) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m512d vec_a, vec_b;

      vec_a = _mm512_loadu_pd(&(i_a[l_n]));
      vec_b = _mm512_loadu_pd(&(i_b[l_n]));

#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd( vec_a, vec_b ) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] = i_a[l_n]*i_b[l_n];
  }
}
 inline
 void store_aligned(double *data) const
 {
     SHORTVEC_ASSERT_ALIGNED(data, 64);
     _mm512_store_pd(data, val);
 }