Example #1
void
printcounters(struct counter *ctrs, uint64_t duration)
{
    struct metrics s = {0};

    s.timestamp = _rdtsc();
    s.duration = duration;
    // Skip the last core: its four hardware threads are logical CPU 0 plus the
    // last three logical CPUs, hence the loop bounds below.
    int corethreads = 0;
    for (int cpu = 1; cpu < gbl.ncpus-3; ++cpu)
    {
        double delta[NEVENTS];
        // volatile because another thread is changing it.
        volatile struct counter *p = &ctrs[cpu];

        for (int i = 0; i < NEVENTS; ++i)
        {
            union {
                __m512d c;
                uint64_t values[8];
            } t;
            t.c = _mm512_load_pd((void *)&p->counts[i][0]);
            delta[i] = perf_scale_delta(t.values, lastctr[cpu].counts[i]);
            _mm512_storenrngo_pd((void *)&lastctr[cpu].counts[i][0], t.c);
            if (delta[i] < 0)
                delta[i] = 0;
            sevents[i] += delta[i];
        }

        if (2*delta[clocks1] > duration)
        {
            s.nthreads += 1;
            corethreads += 1;
        }

        if ((cpu % 4) == 0) // Last thread on this core
        {
            if (corethreads)
                s.ncores += 1;
            corethreads = 0;
        }

        s.vpu_ea += delta[vpu_ea];
        s.instrs += delta[instrs];
        s.vinstrs += delta[vpu_ie];
    }
    uint64_t nreads = 0, nwrites = 0;
    for (int i = 0; i < NGBOXES; ++i)
        for (int j = 0; j < 2; ++j)
        {
            nreads += pmu_rdctr(i, j, 0);
            nwrites += pmu_rdctr(i, j, 1);
        }
    s.rbytes = (nreads - prevnreads) * 64;
    s.wbytes = (nwrites - prevnwrites)* 64;
    prevnreads = nreads;
    prevnwrites = nwrites;

    sample(&s);
}
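For reference, a sketch of the counter layout this routine appears to assume (inferred from the accesses above; the real definitions of struct counter, struct metrics, NEVENTS and the event indices live elsewhere in the project): each event keeps its eight 64-bit counter words in one 64-byte line, so a full event snapshot is a single aligned 512-bit load.

#include <stdint.h>

// Assumed shape only, inferred from p->counts[i][0] and lastctr[cpu].counts[i]:
// one 64-byte (8 x uint64_t) line per event, readable with one _mm512_load_pd.
struct counter {
    uint64_t counts[NEVENTS][8] __attribute__((aligned(64)));
};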
Example #2
#include <stdio.h>
#include <immintrin.h>

int main()
{
   __m512d  t0, t1;

   double d1[8] __attribute__ ((aligned(64)));   
   double d2[8] __attribute__ ((aligned(64)));   
   double d3[8] __attribute__ ((aligned(64)));   

   for(int i=0; i<8; i++)
   {
       d1[i]= i*1.0;

       d2[i]= 0.0;

       d3[i] = d1[i];

   }

   //printf("testing intialization of registers\n");
   //_mm512_store_pd(d1,t0);
   //printf("d1=t0: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   //_mm512_store_pd(d1,t1);
   //printf("d1=t1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);


   t0 = _mm512_load_pd(d1);

   printf("permute backward\n");
   t1 = (__m512d)  _mm512_permute4f128_epi32 ( (__m512i) t0, 0b00111001);
   _mm512_store_pd(d2,t1);
   printf("d1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   printf("d2: %f %f %f %f %f %f %f %f  \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]);
    
   printf("permute forward\n");
   t1 = (__m512d)  _mm512_permute4f128_epi32 ( (__m512i) t0, 0b10010011);
   _mm512_store_pd(d2,t1);
   printf("d1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   printf("d2: %f %f %f %f %f %f %f %f  \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]);
   


   int __attribute__((aligned(64))) order[16]={0,1,0,1,4,5,6,7,8,9,10,11,12,13,14,15};

   __m512i morder = _mm512_load_epi32(order);


   printf("permuting doubles\n");
   t1 = (__m512d) _mm512_permutevar_epi32 (morder, (__m512i) t0);
   _mm512_store_pd(d2,t1);
   printf("d1: %f %f %f %f %f %f %f %f  \n",d1[0],d1[1],d1[2],d1[3],d1[4],d1[5],d1[6],d1[7]);
   printf("d2: %f %f %f %f %f %f %f %f  \n",d2[0],d2[1],d2[2],d2[3],d2[4],d2[5],d2[6],d2[7]);

 
   return 0;

}
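For reference, a minimal scalar sketch (host-portable, no intrinsics) of the 128-bit-lane permute semantics assumed above; it makes the expected output for d1 = {0..7} explicit.

#include <stdio.h>

/* Scalar model of _mm512_permute4f128_epi32 as used on doubles above: each
 * 128-bit lane holds two doubles, and two immediate bits per destination
 * lane select the source lane. This is a sketch of the assumed semantics,
 * not the intrinsic itself. */
static void permute4f128_model(const double src[8], double dst[8], unsigned imm)
{
    for (int lane = 0; lane < 4; ++lane) {
        int s = (imm >> (2 * lane)) & 3;   /* source lane for this destination lane */
        dst[2 * lane]     = src[2 * s];
        dst[2 * lane + 1] = src[2 * s + 1];
    }
}

int main(void)
{
    double d1[8] = {0, 1, 2, 3, 4, 5, 6, 7}, d2[8];

    permute4f128_model(d1, d2, 0x39);  /* 0b00111001 "backward": 2 3 4 5 6 7 0 1 */
    for (int i = 0; i < 8; ++i) printf("%g ", d2[i]);
    printf("\n");

    permute4f128_model(d1, d2, 0x93);  /* 0b10010011 "forward":  6 7 0 1 2 3 4 5 */
    for (int i = 0; i < 8; ++i) printf("%g ", d2[i]);
    printf("\n");
    return 0;
}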
Example #3
inline void transfer_omp_loop_nontemp(uintptr_t rbuf, uintptr_t sbuf, size_t size, HMPI_Request recv_req, HMPI_Request send_req){
    // rbuf and sbuf are byte addresses; each iteration copies one 64-byte
    // cache line (8 doubles), so the stride is 64 bytes per block.
    const size_t BYTES_PER_BLOCK = 64;
    size_t total = size / 64;
//#pragma vector nontemporal
#pragma omp parallel for
    for (size_t i = 0; i < total; i++) {
        __m512d v_b = _mm512_load_pd((const double *)(sbuf + BYTES_PER_BLOCK*i));
        // Non-temporal, no-read-for-ownership store keeps the destination out of the cache.
        _mm512_storenrngo_pd((double *)(rbuf + BYTES_PER_BLOCK*i), v_b);
    }
}
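A minimal usage sketch (hypothetical names and sizes): the copy loop above assumes both byte addresses are 64-byte aligned and that size is a multiple of 64, since each iteration moves one full cache line.

#include <immintrin.h>
#include <stdint.h>

// Hypothetical caller: allocate 64-byte-aligned buffers, then hand their byte
// addresses to the loop above. The request arguments are passed through untouched.
static void example_transfer(size_t bytes, HMPI_Request rreq, HMPI_Request sreq)
{
    double *send = (double *)_mm_malloc(bytes, 64);
    double *recv = (double *)_mm_malloc(bytes, 64);
    for (size_t i = 0; i < bytes / sizeof(double); ++i)
        send[i] = (double)i;
    transfer_omp_loop_nontemp((uintptr_t)recv, (uintptr_t)send, bytes, rreq, sreq);
    _mm_free(send);
    _mm_free(recv);
}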
Example #4
extern void
avx512f_test (void)
{
  x1 = _mm512_mask_mov_pd (x1, m, x2);
  x1 = _mm512_maskz_mov_pd (m, x2);

  x1 = _mm512_load_pd (p);
  x1 = _mm512_mask_load_pd (x1, m, p);
  x1 = _mm512_maskz_load_pd (m, p);

  _mm512_store_pd (p, x1);
  _mm512_mask_store_pd (p, m, x1);
}
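This reads like a compile-only test; a minimal set of file-scope declarations it presumably relies on (the names come from the snippet, while the types, the initial mask value, and any qualifiers such as volatile are assumptions):

#include <immintrin.h>

__m512d  x1, x2;                              /* assumed vector operands */
__mmask8 m = 0x0F;                            /* assumed mask: write only the low four lanes */
double   p[8] __attribute__((aligned(64)));   /* assumed 64-byte-aligned load/store target */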
Example #5
void ks_rank_k_int_d16x14(
    int    k,
    double *a,
    double *b,
    double *c,
    int    ldc,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;


  v8df_t c007_0,  c007_1,  c007_2,  c007_3,  c007_4;
  v8df_t c007_5,  c007_6,  c007_7,  c007_8,  c007_9;
  v8df_t c007_10, c007_11, c007_12, c007_13;

  v8df_t c815_0,  c815_1,  c815_2,  c815_3,  c815_4;
  v8df_t c815_5,  c815_6,  c815_7,  c815_8,  c815_9;
  v8df_t c815_10, c815_11, c815_12, c815_13;


  v8df_t a007, a815, b_tmp;

  int k_iter = k;


  // Zero the accumulators before the FMA loop (the original left a TODO to
  // "clean the c buffer"); each c*_*.v is read by its first fmadd below.
  c007_0.v  = c007_1.v  = c007_2.v  = c007_3.v  = c007_4.v  = _mm512_setzero_pd();
  c007_5.v  = c007_6.v  = c007_7.v  = c007_8.v  = c007_9.v  = _mm512_setzero_pd();
  c007_10.v = c007_11.v = c007_12.v = c007_13.v = _mm512_setzero_pd();
  c815_0.v  = c815_1.v  = c815_2.v  = c815_3.v  = c815_4.v  = _mm512_setzero_pd();
  c815_5.v  = c815_6.v  = c815_7.v  = c815_8.v  = c815_9.v  = _mm512_setzero_pd();
  c815_10.v = c815_11.v = c815_12.v = c815_13.v = _mm512_setzero_pd();

  for ( i = 0; i < k_iter; ++ i ) { 
	a007.v    = _mm512_load_pd( a );
	a815.v    = _mm512_load_pd( a + 8 );

	//printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n", 
	//	a007.d[ 0 ], a007.d[ 1 ], a007.d[ 2 ],  a007.d[ 3 ], 
	//	a007.d[ 4 ], a007.d[ 5 ], a007.d[ 6 ],  a007.d[ 7 ] );

	b_tmp.v   = _mm512_extload_pd( b, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );

    //printf( "b[ 0 ] = %lf\n", b[ 0 ] );

	//printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n", 
	//	b_tmp.d[ 0 ], b_tmp.d[ 1 ], b_tmp.d[ 2 ],  b_tmp.d[ 3 ], 
	//	b_tmp.d[ 4 ], b_tmp.d[ 5 ], b_tmp.d[ 6 ],  b_tmp.d[ 7 ] );



	c007_0.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_0.v ); 
	c815_0.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_0.v ); 
	
	b_tmp.v   = _mm512_extload_pd( b + 1, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_1.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_1.v ); 
	c815_1.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_1.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 2, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_2.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_2.v ); 
	c815_2.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_2.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 3, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_3.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_3.v ); 
	c815_3.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_3.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 4, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_4.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_4.v ); 
	c815_4.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_4.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 5, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_5.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_5.v ); 
	c815_5.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_5.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 6, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_6.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_6.v ); 
	c815_6.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_6.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 7, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_7.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_7.v ); 
	c815_7.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_7.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 8, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_8.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_8.v ); 
	c815_8.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_8.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 9, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_9.v  = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_9.v ); 
	c815_9.v  = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_9.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 10, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_10.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_10.v ); 
	c815_10.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_10.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 11, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_11.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_11.v ); 
	c815_11.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_11.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 12, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_12.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_12.v ); 
	c815_12.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_12.v ); 

	b_tmp.v   = _mm512_extload_pd( b + 13, _MM_UPCONV_PD_NONE, _MM_BROADCAST_1X8, 0 );
	c007_13.v = _mm512_fmadd_pd( a007.v, b_tmp.v, c007_13.v ); 
	c815_13.v = _mm512_fmadd_pd( a815.v, b_tmp.v, c815_13.v );

	a += 16;
	b += 16;
  }
 

  // simulate kernel summation
  c007_0.v  = _mm512_add_pd( c007_0.v, c007_1.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_1.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_2.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_2.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_3.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_3.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_4.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_4.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_5.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_5.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_6.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_6.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_7.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_7.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_8.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_8.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_9.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_9.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_10.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_10.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_11.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_11.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_12.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_12.v ); 

  c007_0.v  = _mm512_add_pd( c007_0.v, c007_13.v ); 
  c815_0.v  = _mm512_add_pd( c815_0.v, c815_13.v ); 










//  if ( aux->pc != 0 ) {
//
//    // packed
//    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
//    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
//
//    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
//    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
//
//    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
//    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
//
//    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
//    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
//    
//
//    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
//    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
//
//    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
//    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
//
//    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
//    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
//
//    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
//    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
//  }
//
//
  // packed
  _mm512_store_pd( c     , c007_0.v );
  _mm512_store_pd( c + 8 , c815_0.v );

//  _mm512_store_pd( c + 16, c007_1.v );
//  _mm512_store_pd( c + 24, c815_1.v );
//
//  _mm512_store_pd( c + 32, c007_2.v );
//  _mm512_store_pd( c + 40, c815_2.v );
//
//  _mm512_store_pd( c + 48, c007_3.v );
//  _mm512_store_pd( c + 56, c815_3.v );
//
//  _mm512_store_pd( c + 64, c007_4.v );
//  _mm512_store_pd( c + 72, c815_4.v );
//
//  _mm512_store_pd( c + 80, c007_5.v );
//  _mm512_store_pd( c + 88, c815_5.v );
//
//  _mm512_store_pd( c + 96, c007_6.v );
//  _mm512_store_pd( c + 104, c815_6.v );
//
//  _mm512_store_pd( c + 112, c007_7.v );
//  _mm512_store_pd( c + 120, c815_7.v );
//
//  _mm512_store_pd( c + 128, c007_8.v );
//  _mm512_store_pd( c + 136, c815_8.v );
//
//  _mm512_store_pd( c + 144, c007_9.v );
//  _mm512_store_pd( c + 152, c815_9.v );
//
//  _mm512_store_pd( c + 160, c007_10.v );
//  _mm512_store_pd( c + 168, c815_10.v );
//
//  _mm512_store_pd( c + 176, c007_11.v );
//  _mm512_store_pd( c + 184, c815_11.v );
//
//  _mm512_store_pd( c + 192, c007_12.v );
//  _mm512_store_pd( c + 200, c815_12.v );
//
//  _mm512_store_pd( c + 208, c007_13.v );
//  _mm512_store_pd( c + 216, c815_13.v );



  //printf( "ldc = %d\n", ldc );
  

//  printf( "%lf, %lf, %lf, %lf, %lf, %lf, %lf, %lf\n", 
//	  c007_0.d[ 0 ], c007_0.d[ 1 ], c007_0.d[ 2 ],  c007_0.d[ 3 ], 
//	  c007_0.d[ 4 ], c007_0.d[ 5 ], c007_0.d[ 6 ],  c007_0.d[ 7 ] );
//

  //printf( "%lf, %lf, %lf, %lf\n", c[1], c[ ldc + 1], c[ ldc * 2 + 1], c[ ldc * 3 + 1] );
  //printf( "%lf, %lf, %lf, %lf\n", c[2], c[ ldc + 2], c[ ldc * 2 + 2], c[ ldc * 3 + 2] );
  //printf( "%lf, %lf, %lf, %lf\n", c[3], c[ ldc + 3], c[ ldc * 2 + 3], c[ ldc * 3 + 3] );
  //printf( "%lf, %lf, %lf, %lf\n", c[4], c[ ldc + 4], c[ ldc * 2 + 4], c[ ldc * 3 + 4] );
  //printf( "%lf, %lf, %lf, %lf\n", c[5], c[ ldc + 5], c[ ldc * 2 + 5], c[ ldc * 3 + 5] );
  //printf( "%lf, %lf, %lf, %lf\n", c[6], c[ ldc + 6], c[ ldc * 2 + 6], c[ ldc * 3 + 6] );
  //printf( "%lf, %lf, %lf, %lf\n", c[7], c[ ldc + 7], c[ ldc * 2 + 7], c[ ldc * 3 + 7] );
}
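The kernel relies on a v8df_t type that overlays a 512-bit vector with eight doubles (the .v and .d[] members used above); a sketch of the assumed definition, limited to the members this file touches:

#include <immintrin.h>

/* Assumed layout of v8df_t, inferred from the .v / .d[] accesses above;
 * the project's real definition may carry additional members. */
typedef union {
    __m512d v;
    double  d[8];
} v8df_t;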
Example #6
void newviewGTRGAMMA_MIC(int tipCase,
                  double *x1, double *x2, double *x3, double *extEV, double *tipVector,
                  int *ex3, unsigned char *tipX1, unsigned char *tipX2,
                  int n, double *left, double *right, int *wgt, int *scalerIncrement, const pllBoolean fastScaling)
{
    __m512d minlikelihood_MIC = _mm512_set1_pd(PLL_MINLIKELIHOOD);
    __m512d twotothe256_MIC = _mm512_set1_pd(PLL_TWOTOTHE256);
    __m512i absMask_MIC = _mm512_set1_epi64(0x7fffffffffffffffULL);

	int addScale = 0;

    double aEV[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));

    #pragma ivdep
    for (int l = 0; l < 64; ++l)
    {
        aEV[l] = extEV[(l / 16) * 4 + (l % 4)];
    }

  switch(tipCase)
  {
    case PLL_TIP_TIP:
      {
        /* multiply all possible tip state vectors with the respective P-matrices
        */

            double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double umpX2[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

            for(int k = 0; k < 256; ++k)
            {
                umpX1[k] = 0.0;
                umpX2[k] = 0.0;
            }

            for(int i = 0; i < maxStateValue; ++i)
            {
              for(int l = 0; l < states; ++l)
              {
                  #pragma ivdep
                  for(int k = 0; k < span; ++k)
                  {
                      umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
                      umpX2[16 * i + k] +=  tipVector[i * 4 + l] * right[k * 4 + l];
                  }
              }
            }

        double auX[64] __attribute__((align(64)));

        for(int i = 0; i < n; ++i)
        {
            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);

            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            const double *uX1 = &umpX1[16 * tipX1[i]];
            const double *uX2 = &umpX2[16 * tipX2[i]];

            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double* v = &x3[i * 16];

            #pragma ivdep
            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v[l] = 0.;
            }

            mic_broadcast16x64(uX, auX);

            for (int j = 0; j < 4; ++j)
            {
                #pragma ivdep
                #pragma vector aligned
                #pragma vector nontemporal
                for(int k = 0; k < 16; ++k)
                {
                    v[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            // init scaling counter for the site
            if (!fastScaling)
                ex3[i] = 0;

        } // sites loop

      }
      break;
    case PLL_TIP_INNER:
      {
        /* we do analogous pre-computations as above, with the only difference that we now do them
        only for one tip vector */

          double umpX1[256] __attribute__((align(PLL_BYTE_ALIGNMENT)));

        /* precompute P and left tip vector product */

        for(int k = 0; k < 256; ++k)
        {
            umpX1[k] = 0.0;
        }

        for(int i = 0; i < 16; ++i)
        {
          for(int l = 0; l < 4; ++l)
          {
              #pragma ivdep
              for(int k = 0; k < 16; ++k)
              {
                  umpX1[16 * i + k] +=  tipVector[i * 4 + l] *  left[k * 4 + l];
              }
          }
        }

        // re-arrange right matrix for better memory layout
        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int j = 0; j < 4; j++)
        {
            for(int l = 0; l < 16; l++)
            {
                aRight[j*16 + l] = right[l*4 + j];
            }
        }

        for (int i = 0; i < n; i++)
        {
            _mm_prefetch((const char*) &x2[span*(i+16)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+16) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x3[span*(i+16)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+16) + 8], _MM_HINT_ET1);

            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            /* access pre-computed value based on the raw sequence data tipX1 that is used as an index */
            double* uX1 = &umpX1[span * tipX1[i]];
            double uX2[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            double uX[16] __attribute__((align(PLL_BYTE_ALIGNMENT)));

            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX2[l] = 0.;
            }

            double aV2[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            const double* v2 = &(x2[16 * i]);

            mic_broadcast16x64(v2, aV2);

            for(int j = 0; j < 4; j++)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int l = 0; l < 16; l++)
                {
                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
                }
            }

            double* v3 = &(x3[span * i]);

            #pragma ivdep
            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v3[l] = 0.;
            }

            double auX[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
            mic_broadcast16x64(uX, auX);

            for (int j = 0; j < 4; ++j)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int k = 0; k < 16; ++k)
                {
                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }

            __m512d t1 = _mm512_load_pd(&v3[0]);
            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
            double vmax1 = _mm512_reduce_gmax_pd(t1);
            __m512d t2 = _mm512_load_pd(&v3[8]);
            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
            double vmax2 = _mm512_reduce_gmax_pd(t2);

            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
            {
				t1 = _mm512_mul_pd(t1, twotothe256_MIC);
				_mm512_store_pd(&v3[0], t1);
				t2 = _mm512_mul_pd(t2, twotothe256_MIC);
				_mm512_store_pd(&v3[8], t2);

                if(!fastScaling)
                  ex3[i] += 1;
                else
                  addScale += wgt[i];
            }
        } // site loop
      }
      break;
    case PLL_INNER_INNER:
    {
      /* same as above, without pre-computations */

        // re-arrange right matrix for better memory layout
        double aLeft[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        double aRight[64] __attribute__((align(PLL_BYTE_ALIGNMENT)));
        for(int j = 0; j < 4; j++)
        {
            for(int l = 0; l < 16; l++)
            {
                aLeft[j*16 + l] = left[l*4 + j];
                aRight[j*16 + l] = right[l*4 + j];
            }
        }

        for (int i = 0; i < n; i++)
        {
            _mm_prefetch((const char*) &x1[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x1[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+8)], _MM_HINT_T1);
            _mm_prefetch((const char*) &x2[span*(i+8) + 8], _MM_HINT_T1);
            _mm_prefetch((const char*) &x3[span*(i+8)], _MM_HINT_ET1);
            _mm_prefetch((const char*) &x3[span*(i+8) + 8], _MM_HINT_ET1);

            _mm_prefetch((const char*) &x1[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x1[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1)], _MM_HINT_T0);
            _mm_prefetch((const char*) &x2[span*(i+1) + 8], _MM_HINT_T0);
            _mm_prefetch((const char*) &x3[span*(i+1)], _MM_HINT_ET0);
            _mm_prefetch((const char*) &x3[span*(i+1) + 8], _MM_HINT_ET0);

            double uX1[16] __attribute__((align(64)));
            double uX2[16] __attribute__((align(64)));
            double uX[16] __attribute__((align(64)));

            for(int l = 0; l < 16; l++)
            {
              uX1[l] = 0.;
              uX2[l] = 0.;
            }

            double aV1[64] __attribute__((align(64)));
            double aV2[64] __attribute__((align(64)));

            const double* v1 = &(x1[span * i]);
            const double* v2 = &(x2[span * i]);

            mic_broadcast16x64(v1, aV1);

            mic_broadcast16x64(v2, aV2);

            for(int j = 0; j < 4; j++)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int l = 0; l < 16; l++)
                {
                    uX1[l] += aV1[j*16 + l] * aLeft[j*16 + l];
                    uX2[l] += aV2[j*16 + l] * aRight[j*16 + l];
                }
            }

            double* v3 =  &(x3[span * i]);

            #pragma ivdep
            #pragma vector aligned
            for(int l = 0; l < 16; ++l)
            {
                uX[l] = uX1[l] * uX2[l];
                v3[l] = 0.;
            }

            double auX[64] __attribute__((align(64)));
            mic_broadcast16x64(uX, auX);

            for(int j = 0; j < 4; ++j)
            {
                #pragma ivdep
                #pragma vector aligned
                for(int k = 0; k < 16; ++k)
                {
                    v3[k] += auX[j*16 + k] * aEV[j*16 + k];
                }
            }


            __m512d t1 = _mm512_load_pd(&v3[0]);
            t1 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t1), absMask_MIC));
            double vmax1 = _mm512_reduce_gmax_pd(t1);
            __m512d t2 = _mm512_load_pd(&v3[8]);
            t2 = _mm512_castsi512_pd(_mm512_and_epi64(_mm512_castpd_si512(t2), absMask_MIC));
            double vmax2 = _mm512_reduce_gmax_pd(t2);

            if(vmax1 < PLL_MINLIKELIHOOD && vmax2 < PLL_MINLIKELIHOOD)
            {
				t1 = _mm512_mul_pd(t1, twotothe256_MIC);
				_mm512_store_pd(&v3[0], t1);
				t2 = _mm512_mul_pd(t2, twotothe256_MIC);
				_mm512_store_pd(&v3[8], t2);

                if(!fastScaling)
                  ex3[i] += 1;
                else
                  addScale += wgt[i];
            }
        }
    } break;
    default:
//      assert(0);
      break;
  }

  /* as above, increment the global counter that counts scaling multiplications by the scaling multiplications
     carried out for computing the likelihood array at node p */

  if (fastScaling)
  {
      *scalerIncrement = addScale;
  }

}
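The scaling test above takes |v| by clearing the IEEE-754 sign bit of each lane with a 64-bit AND against 0x7fffffffffffffff before the max-reduction; a minimal scalar sketch of the same trick (host-portable, no intrinsics):

#include <stdint.h>
#include <string.h>

/* Scalar model of the abs-by-masking idiom: clear bit 63 of the double's bit
 * pattern, which is what _mm512_and_epi64 with absMask_MIC does per lane. */
static double abs_by_mask(double x)
{
    uint64_t bits;
    memcpy(&bits, &x, sizeof bits);     /* well-defined type punning */
    bits &= 0x7fffffffffffffffULL;      /* drop the sign bit */
    memcpy(&x, &bits, sizeof x);
    return x;
}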
Example #7
File: arch.hpp Project: huoyao/Hydro
inline F64vec8 load(const double *r)
{
    return _mm512_load_pd(r);
}
// Member function of the project's vector wrapper: `val` is the wrapped __m512d.
inline
void load_aligned(const double *data)
{
    SHORTVEC_ASSERT_ALIGNED(data, 64);
    val = _mm512_load_pd(data);
}
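A minimal usage sketch (hypothetical array name) for the free-function load wrapper above; like the raw intrinsic, it requires a 64-byte-aligned source:

// Hypothetical caller of the load() helper defined above.
double a[8] __attribute__((aligned(64))) = {0, 1, 2, 3, 4, 5, 6, 7};
F64vec8 v = load(a);   // wraps _mm512_load_pd(a)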