__m256d
test_mm256_broadcast_sd(double const *__a) {
  // CHECK-LABEL: @test_mm256_broadcast_sd
  // CHECK: insertelement <4 x double> {{.*}}, i32 0
  // CHECK: insertelement <4 x double> {{.*}}, i32 1
  // CHECK: insertelement <4 x double> {{.*}}, i32 2
  // CHECK: insertelement <4 x double> {{.*}}, i32 3
  return _mm256_broadcast_sd(__a);
}
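The FileCheck lines above verify that Clang lowers _mm256_broadcast_sd to a scalar load followed by four insertelement operations, i.e. one double replicated into every lane of the 256-bit vector. For reference, a minimal sketch (not part of the original test) of the same semantics without the broadcast intrinsic:

#include <immintrin.h>

/* Same result as _mm256_broadcast_sd(a): read one double, replicate it into all 4 lanes. */
static inline __m256d broadcast_sd_ref(double const *a)
{
  return _mm256_set1_pd(*a);
}
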
static inline void matmul_4xkxkx4(int lda, int K, double* a, double* b, double* c)
{
  __m256d a_coli, bi0, bi1, bi2, bi3;
  __m256d c_col0, c_col1, c_col2, c_col3;

  /* layout of 4x4 c matrix
      00 01 02 03
      10 11 12 13
      20 21 22 23
      30 31 32 33
  */
  double* c01_ptr = c + lda;
  double* c02_ptr = c01_ptr + lda;
  double* c03_ptr = c02_ptr + lda;

  // load old value of c
  c_col0 = _mm256_loadu_pd(c);
  c_col1 = _mm256_loadu_pd(c01_ptr);
  c_col2 = _mm256_loadu_pd(c02_ptr);
  c_col3 = _mm256_loadu_pd(c03_ptr);

  // for every column of a (or every row of b)
  for (int i = 0; i < K; ++i) 
  {
    a_coli = _mm256_load_pd(a);
    a += 4;

    bi0 = _mm256_broadcast_sd(b++);
    bi1 = _mm256_broadcast_sd(b++);
    bi2 = _mm256_broadcast_sd(b++);
    bi3 = _mm256_broadcast_sd(b++);

    c_col0 = _mm256_add_pd(c_col0, _mm256_mul_pd(a_coli, bi0));
    c_col1 = _mm256_add_pd(c_col1, _mm256_mul_pd(a_coli, bi1));
    c_col2 = _mm256_add_pd(c_col2, _mm256_mul_pd(a_coli, bi2));
    c_col3 = _mm256_add_pd(c_col3, _mm256_mul_pd(a_coli, bi3));
  }

  _mm256_storeu_pd(c, c_col0);
  _mm256_storeu_pd(c01_ptr, c_col1);
  _mm256_storeu_pd(c02_ptr, c_col2);
  _mm256_storeu_pd(c03_ptr, c_col3);
}
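The loop above performs K rank-1 updates: each iteration takes one 4-element column of a and the four scalars of the matching row of b, and accumulates their outer product into the four columns of the 4x4 block of c. A scalar reference of the same computation, assuming the packed layout implied by the pointer arithmetic (a stored as K columns of 4 doubles, b as K rows of 4 doubles, c column-major with leading dimension lda), would be:

/* Scalar reference for matmul_4xkxkx4 above; a sketch, not from the original source. */
static void matmul_4xkxkx4_ref(int lda, int K, const double* a, const double* b, double* c)
{
  for (int i = 0; i < K; ++i)              /* one rank-1 update per step */
    for (int col = 0; col < 4; ++col)      /* column of c == element of b row i */
      for (int row = 0; row < 4; ++row)
        c[col*lda + row] += a[4*i + row] * b[4*i + col];
}
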
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_vector_set( const double i_scalar,
                        double*       io_c,
                        const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );

  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    const __m256d vec_scalar = _mm256_broadcast_sd(&i_scalar);
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]),   vec_scalar );
      _mm256_store_pd(  &(io_c[l_n+4]), vec_scalar );
#else
      _mm256_stream_pd( &(io_c[l_n]),   vec_scalar );
      _mm256_stream_pd( &(io_c[l_n+4]), vec_scalar );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_scalar = _mm512_broadcastsd_pd(_mm_load_sd(&i_scalar));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), vec_scalar );
#else
      _mm512_stream_pd( &(io_c[l_n]), vec_scalar );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] = i_scalar;
  }
}
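A minimal usage sketch for stream_vector_set, assuming stream_init and the LIBXSMM macros come from the surrounding library, and using a 64-byte aligned buffer so the streamed bulk hits aligned addresses:

/* Hypothetical driver; stream_init() is provided by the surrounding library. */
#include <stdio.h>
#include <immintrin.h>

int main(void) {
  const int n = 1000;
  double *buf = (double*)_mm_malloc((size_t)n * sizeof(double), 64);
  stream_vector_set(3.0, buf, n);            /* buf[0..n-1] = 3.0 */
  printf("%f %f\n", buf[0], buf[n-1]);
  _mm_free(buf);
  return 0;
}
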
Example #4
static void
avx_test (void)
{
  int i;
  double s = 39678;
  union256d u;
  double e [4];

  u.x = _mm256_broadcast_sd (&s);

  for (i = 0; i < 4; i++)
    e[i] = s;

  if (check_union256d (u, e))
    abort ();
}
Example #5
void gaussian_int_d8x6(
    int    k,
    int    rhs,
    //double *h,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i;
  double alpha = ker->scal;
  // 16 registers.
  v4df_t c03_0, c03_1, c03_2, c03_3, c03_4, c03_5;
  v4df_t c47_0, c47_1, c47_2, c47_3, c47_4, c47_5;
  v4df_t a03, a47, b0, b1;

  #include <rank_k_int_d8x6.h>
  #include <sq2nrm_int_d8x6.h>

  // Scale before the kernel evaluation
  a03.v   = _mm256_broadcast_sd( &alpha );
  c03_0.v = _mm256_mul_pd( a03.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( a03.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( a03.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( a03.v, c03_3.v );
  c03_4.v = _mm256_mul_pd( a03.v, c03_4.v );
  c03_5.v = _mm256_mul_pd( a03.v, c03_5.v );

  c47_0.v = _mm256_mul_pd( a03.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( a03.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( a03.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( a03.v, c47_3.v );
  c47_4.v = _mm256_mul_pd( a03.v, c47_4.v );
  c47_5.v = _mm256_mul_pd( a03.v, c47_5.v );

  // Prefetch u, w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );

  // c = exp( c )
  c03_0.v = _mm256_exp_pd( c03_0.v );
  c03_1.v = _mm256_exp_pd( c03_1.v );
  c03_2.v = _mm256_exp_pd( c03_2.v );
  c03_3.v = _mm256_exp_pd( c03_3.v );
  c03_4.v = _mm256_exp_pd( c03_4.v );
  c03_5.v = _mm256_exp_pd( c03_5.v );

  c47_0.v = _mm256_exp_pd( c47_0.v );
  c47_1.v = _mm256_exp_pd( c47_1.v );
  c47_2.v = _mm256_exp_pd( c47_2.v );
  c47_3.v = _mm256_exp_pd( c47_3.v );
  c47_4.v = _mm256_exp_pd( c47_4.v );
  c47_5.v = _mm256_exp_pd( c47_5.v );

  // Preload u03, u47
  a03.v    = _mm256_load_pd( (double*)  u       );
  a47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  // Multiple rhs weighted sum.
  #include<weighted_sum_int_d8x6.h>

  //if ( u[ 0 ] != u[ 0 ] ) printf( "u[ 0 ] nan\n" );
  //if ( u[ 1 ] != u[ 1 ] ) printf( "u[ 1 ] nan\n" );
  //if ( u[ 2 ] != u[ 2 ] ) printf( "u[ 2 ] nan\n" );
  //if ( u[ 3 ] != u[ 3 ] ) printf( "u[ 3 ] nan\n" );
  //if ( u[ 4 ] != u[ 4 ] ) printf( "u[ 4 ] nan\n" );
  //if ( u[ 5 ] != u[ 5 ] ) printf( "u[ 5 ] nan\n" );
  //if ( u[ 6 ] != u[ 6 ] ) printf( "u[ 6 ] nan\n" );
  //if ( u[ 7 ] != u[ 7 ] ) printf( "u[ 7 ] nan\n" );

  //if ( w[ 0 ] != w[ 0 ] ) printf( "w[ 0 ] nan\n" );
  //if ( w[ 1 ] != w[ 1 ] ) printf( "w[ 1 ] nan\n" );
  //if ( w[ 2 ] != w[ 2 ] ) printf( "w[ 2 ] nan\n" );
  //if ( w[ 3 ] != w[ 3 ] ) printf( "w[ 3 ] nan\n" );
  //if ( w[ 4 ] != w[ 4 ] ) printf( "w[ 4 ] nan\n" );
  //if ( w[ 5 ] != w[ 5 ] ) printf( "w[ 5 ] nan\n" );
}
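_mm256_exp_pd above (and _mm256_tanh_pd in a later example) are not plain AVX instructions; they are short-vector math routines supplied by the compiler's math library (e.g. Intel SVML). On a toolchain without such a library, a per-lane fallback along these lines can stand in; this is only a sketch and noticeably slower:

#include <math.h>
#include <immintrin.h>

/* Per-lane fallback for _mm256_exp_pd on compilers without a vector math library. */
static inline __m256d exp256_pd_fallback(__m256d x)
{
  double t[4];
  _mm256_storeu_pd(t, x);
  for (int i = 0; i < 4; ++i) t[i] = exp(t[i]);
  return _mm256_loadu_pd(t);
}
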
Example #6
ALGEBRA_INLINE void		vector_addm_double_aligned_32 (double* v1,double lambda,const double* v2,size_t n)
{
	size_t k;
	
	__m256d l1 = _mm256_broadcast_sd(&lambda);
	__m256d l2 = _mm256_broadcast_sd(&lambda);
	__m256d l3 = _mm256_broadcast_sd(&lambda);
	__m256d l4 = _mm256_broadcast_sd(&lambda);

	size_t q = n / 16;
	size_t r = n % 16;
	if(q > 0) {
		if (ALGEBRA_IS_ALIGNED(v1) && ALGEBRA_IS_ALIGNED(v2)) {
			for (k=0;k<q;k++) {
				/* Load four vectors (16 doubles) from each array */
				__m256d i1 = _mm256_load_pd(v1);
				__m256d j1 = _mm256_load_pd(v2);
				__m256d i2 = _mm256_load_pd(v1+4);
				__m256d j2 = _mm256_load_pd(v2+4);
				__m256d i3 = _mm256_load_pd(v1+8);
				__m256d j3 = _mm256_load_pd(v2+8);
				__m256d i4 = _mm256_load_pd(v1+12);
				__m256d j4 = _mm256_load_pd(v2+12);
				/* Multiply */
					   j1 = _mm256_mul_pd(j1, l1);
					   j2 = _mm256_mul_pd(j2, l2);
					   j3 = _mm256_mul_pd(j3, l3);
					   j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store */
				_mm256_store_pd(v1, i1);
				_mm256_store_pd(v1+4, i2);
				_mm256_store_pd(v1+8, i3);
				_mm256_store_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
		else {		
			for (k=0;k<q;k++) {
				/* Load four vectors (16 doubles) from each array */
				__m256d i1 = _mm256_loadu_pd(v1);
				__m256d j1 = _mm256_loadu_pd(v2);
				__m256d i2 = _mm256_loadu_pd(v1+4);
				__m256d j2 = _mm256_loadu_pd(v2+4);
				__m256d i3 = _mm256_loadu_pd(v1+8);
				__m256d j3 = _mm256_loadu_pd(v2+8);
				__m256d i4 = _mm256_loadu_pd(v1+12);
				__m256d j4 = _mm256_loadu_pd(v2+12);
				/* Multiply */
					   j1 = _mm256_mul_pd(j1, l1);
					   j2 = _mm256_mul_pd(j2, l2);
					   j3 = _mm256_mul_pd(j3, l3);
					   j4 = _mm256_mul_pd(j4, l4);
				/* Add */
				i1 = _mm256_add_pd(i1,j1);
				i2 = _mm256_add_pd(i2,j2);
				i3 = _mm256_add_pd(i3,j3);
				i4 = _mm256_add_pd(i4,j4);
				/* Store */
				_mm256_storeu_pd(v1, i1);
				_mm256_storeu_pd(v1+4, i2);
				_mm256_storeu_pd(v1+8, i3);
				_mm256_storeu_pd(v1+12, i4);
				v1 += 16;
				v2 += 16;
			}
		}
	}
	
	for(k = 0 ; k<r ; k++)
		v1[k] += lambda*v2[k];
}
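The routine above is an AXPY-style update, v1[i] += lambda * v2[i]; broadcasting lambda into four separate registers only serves to break dependencies between the four independent accumulation chains. A usage sketch, assuming 32-byte aligned allocations so the faster aligned branch is taken:

#include <stdio.h>
#include <stdlib.h>

int main(void) {
  size_t n = 100;
  double *v1 = aligned_alloc(32, n * sizeof(double));
  double *v2 = aligned_alloc(32, n * sizeof(double));
  for (size_t i = 0; i < n; ++i) { v1[i] = 1.0; v2[i] = (double)i; }
  vector_addm_double_aligned_32(v1, 0.5, v2, n);  /* v1[i] += 0.5 * v2[i] */
  printf("%f\n", v1[10]);                         /* expect 6.000000 */
  free(v1); free(v2);
  return 0;
}
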
// it moves horizontally inside a block (A upper triangular)
void kernel_dtrmv_u_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;
	
	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
	y_0_1_2_3_d = _mm256_setzero_pd();
	
	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_c = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_c = _mm256_blend_pd( y_0_1_2_3_c, y_0_1_2_3_d, 0xc );

	// fourth col (avoid zero y_0_1_2_3)
	x_3     = _mm256_broadcast_sd( &x[3] );
	a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_03_13_23_33, x_3 );

	// third col
	x_2     = _mm256_broadcast_sd( &x[2] );
	x_2     = _mm256_blend_pd( x_2, zeros, 0x8 );
	a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
	y_0_1_2_3_b = _mm256_mul_pd( a_02_12_22_32, x_2 );

	A += 4*lda;
	x += 4;

	k=4;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		
		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}

	}
// it moves horizontally inside a block
void kernel_dtrmv_u_n_8_lib4(int kmax, double *A0, int sda, double *x, double *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	double *A1 = A0 + 4*sda;

	const int lda = 4;
	
	int k;

	__m128d
		tmp0,
		z_0, y_0_1, a_00_10;

	__m256d
		zeros,
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_4_5_6_7   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_4_5_6_7_b = _mm256_setzero_pd();	*/
		
	zeros = _mm256_setzero_pd();
	
/*	y_0_1_2_3   = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_b = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_c = _mm256_setzero_pd();	*/
/*	y_0_1_2_3_d = _mm256_setzero_pd();*/
	
	// upper triangular

	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A0[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A0[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_0_1_2_3_b = _mm256_castpd128_pd256( y_0_1 );
	y_0_1_2_3_b = _mm256_blend_pd( y_0_1_2_3_b, zeros, 0xc ); // zero the upper two lanes (cf. the 4-wide kernel above)

	// fourth col (avoid zero y_0_1_2_3)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
	y_0_1_2_3 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	// upper squared
	x_0 = _mm256_broadcast_sd( &x[0] );
	x_1 = _mm256_broadcast_sd( &x[1] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

	x_0 = _mm256_broadcast_sd( &x[2] );
	x_1 = _mm256_broadcast_sd( &x[3] );

	a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
	a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );

	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
	ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );


	// lower triangular


	// second col (avoid zero y_0_1)
	z_0     = _mm_loaddup_pd( &x[1] );
	a_00_10 = _mm_load_pd( &A1[0+lda*1] );
	y_0_1   = _mm_mul_pd( a_00_10, z_0 );

	// first col
	z_0     = _mm_load_sd( &x[0] );
	a_00_10 = _mm_load_sd( &A1[0+lda*0] );
	tmp0    = _mm_mul_sd( a_00_10, z_0 );
	y_0_1   = _mm_add_sd( y_0_1, tmp0 );
	y_4_5_6_7_b = _mm256_castpd128_pd256( y_0_1 );
	y_4_5_6_7_b = _mm256_blend_pd( y_4_5_6_7_b, zeros, 0xc ); // zero the upper two lanes

	// fourth col (avoid zero y_4_5_6_7)
	x_1     = _mm256_broadcast_sd( &x[3] );
	a_01_11_21_31 = _mm256_load_pd( &A1[0+lda*3] );
	y_4_5_6_7 = _mm256_mul_pd( a_01_11_21_31, x_1 );

	// third col
	x_0     = _mm256_broadcast_sd( &x[2] );
	x_0     = _mm256_blend_pd( x_0, zeros, 0x8 );
	a_00_10_20_30 = _mm256_load_pd( &A1[0+lda*2] );
	ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
	y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );


	A0 += 4*lda;
	A1 += 4*lda;
	x  += 4;


	k=8;
	for(; k<kmax-3; k+=4)
		{

/*		__builtin_prefetch( A0 + 4*lda );*/
/*		__builtin_prefetch( A1 + 4*lda );*/

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*		__builtin_prefetch( A0 + 5*lda );*/
/*		__builtin_prefetch( A1 + 5*lda );*/

		x_0 = _mm256_broadcast_sd( &x[2] );
		x_1 = _mm256_broadcast_sd( &x[3] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
	
		A0 += 4*lda;
		A1 += 4*lda;
		x  += 4;

		}
		
	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		
/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}

	}
// it moves vertically across blocks
void kernel_dsymv_4_lib4(int kmax, double *A, int sda, double *x_n, double *y_n, double *z_n, double *x_t, double *y_t, double *z_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;

/*printf("\nciao %d\n", kmax);	*/
	const int bs = 4;
	
	__builtin_prefetch( A + bs*0 );
	__builtin_prefetch( A + bs*2 );

	int k, ka;
	ka = kmax; // number from aligned position

	double k_left;
	
//	double *sA, *sy_n, *sx_t;

	static double d_mask[4]  = {0.5, 1.5, 2.5, 3.5};

	__m256d
		v_mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	__m256i
		i_mask;

#if 0
	__m128d
		stemp,
		sa_00, sa_01, sa_02, sa_03,
		sx_n_0, sx_n_1, sx_n_2, sx_n_3, sy_n_0,
		sx_t_0, sy_t_0, sy_t_1, sy_t_2, sy_t_3;
#endif
	
	zeros = _mm256_setzero_pd();

	x_n_0 = _mm256_broadcast_sd( &x_n[0] );
	x_n_1 = _mm256_broadcast_sd( &x_n[1] );
	x_n_2 = _mm256_broadcast_sd( &x_n[2] );
	x_n_3 = _mm256_broadcast_sd( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_pd( zeros, x_n_0 );
		x_n_1 = _mm256_sub_pd( zeros, x_n_1 );
		x_n_2 = _mm256_sub_pd( zeros, x_n_2 );
		x_n_3 = _mm256_sub_pd( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_pd();
	y_t_1 = _mm256_setzero_pd();
	y_t_2 = _mm256_setzero_pd();
	y_t_3 = _mm256_setzero_pd();
	
#if 0
	sx_n_0 = _mm256_castpd256_pd128( x_n_0 );
	sx_n_1 = _mm256_castpd256_pd128( x_n_1 );
	sx_n_2 = _mm256_castpd256_pd128( x_n_2 );
	sx_n_3 = _mm256_castpd256_pd128( x_n_3 );

	sy_t_0 = _mm256_castpd256_pd128( y_t_0 );
	sy_t_1 = _mm256_castpd256_pd128( y_t_1 );
	sy_t_2 = _mm256_castpd256_pd128( y_t_2 );
	sy_t_3 = _mm256_castpd256_pd128( y_t_3 );

	k = bs*(ka/bs);
	sA = A + (ka/bs)*sda*bs;
	sy_n = y_n + (ka/bs)*bs;
	sx_t = x_t + (ka/bs)*bs;

	for(; k<ka; k++)
		{
		
		sy_n_0 = _mm_load_sd( &sy_n[0] );
		sx_t_0 = _mm_load_sd( &sx_t[0] );
		
		sa_00 = _mm_load_sd( &sA[0+bs*0] );
		sa_01 = _mm_load_sd( &sA[0+bs*1] );
		sa_02 = _mm_load_sd( &sA[0+bs*2] );
		sa_03 = _mm_load_sd( &sA[0+bs*3] );
		
		stemp  = _mm_mul_sd( sa_00, sx_n_0 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_00, sx_t_0 );
		sy_t_0 = _mm_add_sd( sy_t_0, stemp );
		stemp  = _mm_mul_sd( sa_01, sx_n_1 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_01, sx_t_0 );
		sy_t_1 = _mm_add_sd( sy_t_1, stemp );
		stemp  = _mm_mul_sd( sa_02, sx_n_2 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_02, sx_t_0 );
		sy_t_2 = _mm_add_sd( sy_t_2, stemp );
		stemp  = _mm_mul_sd( sa_03, sx_n_3 );
		sy_n_0 = _mm_add_sd( sy_n_0, stemp );
		stemp  = _mm_mul_sd( sa_03, sx_t_0 );
		sy_t_3 = _mm_add_sd( sy_t_3, stemp );
		
		_mm_store_sd( &sy_n[0], sy_n_0 );

		
		sA += 1;
		sy_n += 1;
		sx_t += 1;

		}

	y_t_0 = _mm256_castpd128_pd256( sy_t_0 );
	y_t_1 = _mm256_castpd128_pd256( sy_t_1 );
	y_t_2 = _mm256_castpd128_pd256( sy_t_2 );
	y_t_3 = _mm256_castpd128_pd256( sy_t_3 );
#endif

	k=0;

	// corner
	if(tri==1)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 14 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		temp  = _mm256_blend_pd( zeros, temp, 14 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 12 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		temp  = _mm256_blend_pd( zeros, temp, 12 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		temp  = _mm256_blend_pd( zeros, temp, 8 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		temp  = _mm256_blend_pd( zeros, temp, 8 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );
		

		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		k += 4;

		}

	for(; k<ka-7; k+=2*bs)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		}

	for(; k<ka-3; k+=bs)
		{
		
		__builtin_prefetch( A + sda*bs +bs*0 );
		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_loadu_pd( &x_t[0] );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_storeu_pd( &z_n[0], y_n_0 );

		
		A += sda*bs;
		y_n += 4;
		z_n += 4;
		x_t += 4;

		}
	if(k<ka)
		{

		k_left = ka-k;
		v_mask  = _mm256_sub_pd( _mm256_loadu_pd( d_mask ), _mm256_broadcast_sd( &k_left ) );
		i_mask  = _mm256_castpd_si256( v_mask );

//		__builtin_prefetch( A + sda*bs +bs*0 );
//		__builtin_prefetch( A + sda*bs +bs*2 );

		y_n_0 = _mm256_loadu_pd( &y_n[0] );
		x_t_0 = _mm256_maskload_pd( &x_t[0], i_mask );
		
		a_00 = _mm256_load_pd( &A[0+bs*0] );
		a_01 = _mm256_load_pd( &A[0+bs*1] );
		a_02 = _mm256_load_pd( &A[0+bs*2] );
		a_03 = _mm256_load_pd( &A[0+bs*3] );
		
		temp  = _mm256_mul_pd( a_00, x_n_0 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_00, x_t_0 );
		y_t_0 = _mm256_add_pd( y_t_0, temp );
		temp  = _mm256_mul_pd( a_01, x_n_1 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_01, x_t_0 );
		y_t_1 = _mm256_add_pd( y_t_1, temp );
		temp  = _mm256_mul_pd( a_02, x_n_2 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_02, x_t_0 );
		y_t_2 = _mm256_add_pd( y_t_2, temp );
		temp  = _mm256_mul_pd( a_03, x_n_3 );
		y_n_0 = _mm256_add_pd( y_n_0, temp );
		temp  = _mm256_mul_pd( a_03, x_t_0 );
		y_t_3 = _mm256_add_pd( y_t_3, temp );
		
		_mm256_maskstore_pd( &z_n[0], i_mask, y_n_0 );

		
//		A += sda*bs;
//		y_n += 4;
//		z_n += 4;
//		x_t += 4;

		}
	
	__m256d
		y_0_1_2_3;

	y_t_0 = _mm256_hadd_pd( y_t_0, y_t_1 );
	y_t_2 = _mm256_hadd_pd( y_t_2, y_t_3 );

	y_t_1 = _mm256_permute2f128_pd( y_t_2, y_t_0, 2  );	
	y_t_0 = _mm256_permute2f128_pd( y_t_2, y_t_0, 19 );	

	y_t_0 = _mm256_add_pd( y_t_0, y_t_1 );

	if(alg==1)
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_t_0 );
		_mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm256_loadu_pd( &y_t[0] );
		y_0_1_2_3 = _mm256_sub_pd( y_0_1_2_3, y_t_0 );
		_mm256_storeu_pd( &z_t[0], y_0_1_2_3 );
		}
	
	}
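The remainder path above builds its lane mask arithmetically: subtracting the broadcast k_left from d_mask = {0.5, 1.5, 2.5, 3.5} leaves a negative value (sign bit set) exactly in the lanes whose index is below k_left, and _mm256_maskload_pd / _mm256_maskstore_pd act only on lanes whose most significant bit is set. A stand-alone sketch of the same trick:

#include <stdio.h>
#include <immintrin.h>

int main(void) {
  static const double d_mask[4] = {0.5, 1.5, 2.5, 3.5};
  double src[4] = {10.0, 11.0, 12.0, 13.0};
  double k_left = 2.0;                               /* two elements remain */

  __m256d v_mask = _mm256_sub_pd(_mm256_loadu_pd(d_mask),
                                 _mm256_broadcast_sd(&k_left));
  __m256i i_mask = _mm256_castpd_si256(v_mask);      /* sign bit selects the lane */

  __m256d v = _mm256_maskload_pd(src, i_mask);       /* loads lanes 0 and 1 only */
  double out[4];
  _mm256_storeu_pd(out, v);
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  /* 10 11 0 0 */
  return 0;
}
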
Example #10
// it moves horizontally inside a block
void kernel_dgemv_n_8_lib4(int kmax, double *A0, double *A1, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m256d
		ax_temp,
		a_00_10_20_30, a_01_11_21_31,
		a_40_50_60_70, a_41_51_61_71,
		x_0, x_1,
		y_0_1_2_3, y_0_1_2_3_b, z_0_1_2_3,
		y_4_5_6_7, y_4_5_6_7_b, z_4_5_6_7;
	
	y_0_1_2_3   = _mm256_setzero_pd();	
	y_4_5_6_7   = _mm256_setzero_pd();	
	y_0_1_2_3_b = _mm256_setzero_pd();	
	y_4_5_6_7_b = _mm256_setzero_pd();	

	if(kmax<=64)
		{

		k=0;
		for(; k<kmax-3; k+=4)
			{

/*			__builtin_prefetch( A0 + 4*lda );*/
/*			__builtin_prefetch( A1 + 4*lda );*/

			x_0 = _mm256_broadcast_sd( &x[0] );
			x_1 = _mm256_broadcast_sd( &x[1] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*			__builtin_prefetch( A0 + 5*lda );*/
/*			__builtin_prefetch( A1 + 5*lda );*/

			x_0 = _mm256_broadcast_sd( &x[2] );
			x_1 = _mm256_broadcast_sd( &x[3] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
			A0 += 4*lda;
			A1 += 4*lda;
			x  += 4;

			}
		
		}
	else
		{

		k=0;
		for(; k<kmax-3; k+=4)
			{

			__builtin_prefetch( A0 + 4*lda );
			__builtin_prefetch( A1 + 4*lda );

			x_0 = _mm256_broadcast_sd( &x[0] );
			x_1 = _mm256_broadcast_sd( &x[1] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );

/*			__builtin_prefetch( A0 + 5*lda );*/
/*			__builtin_prefetch( A1 + 5*lda );*/
			__builtin_prefetch( A0 + 6*lda );
			__builtin_prefetch( A1 + 6*lda );

			x_0 = _mm256_broadcast_sd( &x[2] );
			x_1 = _mm256_broadcast_sd( &x[3] );

			a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*2] );
			a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*2] );
			a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*3] );
			a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*3] );

			ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
			y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
			ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
			y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
			ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
			y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
			ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
			y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
			A0 += 4*lda;
			A1 += 4*lda;
			x  += 4;

			}
	
		}


	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A0[0+lda*1] );
		a_41_51_61_71 = _mm256_load_pd( &A1[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );
		ax_temp = _mm256_mul_pd( a_41_51_61_71, x_1 );
		y_4_5_6_7_b = _mm256_add_pd( y_4_5_6_7_b, ax_temp );
		
		A0 += 2*lda;
		A1 += 2*lda;
		x  += 2;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );
	y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, y_4_5_6_7_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A0[0+lda*0] );
		a_40_50_60_70 = _mm256_load_pd( &A1[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_40_50_60_70, x_0 );
		y_4_5_6_7 = _mm256_add_pd( y_4_5_6_7, ax_temp );
		
/*		A0 += 1*lda;*/
/*		A1 += 1*lda;*/
/*		x  += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		_mm256_storeu_pd(&y[4], y_4_5_6_7);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_add_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_add_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );
		z_4_5_6_7 = _mm256_loadu_pd( &y[4] );

		z_0_1_2_3 = _mm256_sub_pd( z_0_1_2_3, y_0_1_2_3 );
		z_4_5_6_7 = _mm256_sub_pd( z_4_5_6_7, y_4_5_6_7 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		_mm256_storeu_pd(&y[4], z_4_5_6_7);
		}

	}
void ks_multiquadratic_int_d8x4(
    int    k,
    int    rhs,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i, rhs_left;
  double neg2  = -2.0;
  double dzero =  0.0;
  double done  =  1.0;
  double mdone = -1.0;
  double alpha = ( 3.0 / 4.0 );
  double cons  = ker->cons;


  v4df_t    c03_0,    c03_1,    c03_2,    c03_3;
  v4df_t    c47_0,    c47_1,    c47_2,    c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A 
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;


  // Rank-k update segment
  #include "ks_rank_k_int_d8x4.h"


  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  // Accumulate
  if ( aux->pc ) {
    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }


  // Scale -2
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );


  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );


  // Check if there is any illegal value
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );


  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  // c = c + cons
  c_tmp.v  = _mm256_broadcast_sd( &cons );
  c03_0.v  = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( c_tmp.v, c47_3.v );

  // Multiple rhs kernel summation.
  #include "ks_kernel_summation_int_d8x4.h"

}
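The scale-by-(-2), add-aa, add-bb sequence in this family of kernels is the standard expansion of pairwise squared distances: c_ij = ||a_i - b_j||^2 = ||a_i||^2 - 2*(a_i . b_j) + ||b_j||^2, where c initially holds the rank-k inner products and aa/bb hold the precomputed squared norms. A scalar sketch of the expansion for one 8x4 tile (illustrative only, not from the original source):

/* c[i][j] holds a_i . b_j on entry; aa and bb hold the squared norms. */
static void expand_sq_distances_ref(double c[8][4], const double aa[8], const double bb[4])
{
  for (int j = 0; j < 4; ++j)
    for (int i = 0; i < 8; ++i) {
      double d = -2.0 * c[i][j] + aa[i] + bb[j];
      c[i][j] = d > 0.0 ? d : 0.0;   /* clamp small negatives caused by rounding */
    }
}
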
Example #12
void rnn_int_d8x4_var2(
    int    k,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *c,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B
  v4df_t aa_tmp, bb_tmp;


  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( c ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->I ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->D ) );


  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Check if there is any illegal value
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );


  // Transpose the c03/c47 _0.._3 column blocks into row vectors
  tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 );
  tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF );

  tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 );
  tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF );

  tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 );
  tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF );

  tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 );
  tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 );


  // c03_0;
  // c03_1;
  // c03_2;
  // c03_3;
  // c47_0;
  // c47_1;
  // c47_2;
  // c47_3;


  _mm256_store_pd( c     , c03_0.v );
  _mm256_store_pd( c +  4, c03_1.v );
  _mm256_store_pd( c +  8, c03_2.v );
  _mm256_store_pd( c + 12, c03_3.v );
  _mm256_store_pd( c + 16, c47_0.v );
  _mm256_store_pd( c + 20, c47_1.v );
  _mm256_store_pd( c + 24, c47_2.v );
  _mm256_store_pd( c + 28, c47_3.v );
}
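Instead of broadcasting the four entries of each b row (the commented-out reference loop above does exactly that), this kernel rotates the packed b vector with one in-lane shuffle and one 128-bit permute, so every a vector meets four different lane orderings of b; the blend/permute block at the end then untangles the mixed-up accumulators back into plain columns. A small stand-alone check of the orderings the chain produces:

#include <stdio.h>
#include <immintrin.h>

int main(void) {
  double b[4] = {0.0, 1.0, 2.0, 3.0};
  __m256d b0 = _mm256_loadu_pd(b);                  /* ( b0, b1, b2, b3 ) */
  __m256d b1 = _mm256_shuffle_pd(b0, b0, 0x5);      /* ( b1, b0, b3, b2 ) in-lane swap */
  __m256d b2 = _mm256_permute2f128_pd(b1, b1, 0x1); /* ( b3, b2, b1, b0 ) half swap */
  __m256d b3 = _mm256_shuffle_pd(b2, b2, 0x5);      /* ( b2, b3, b0, b1 ) in-lane swap */

  double t[4];
  _mm256_storeu_pd(t, b3);
  printf("%g %g %g %g\n", t[0], t[1], t[2], t[3]);  /* prints: 2 3 0 1 */
  return 0;
}
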
Example #13
void tanh_int_d8x6(
    int    k,
    int    rhs,
    //double *h,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i;
  double scal = ker->scal;
  double cons = ker->cons;
  // 16 registers.
  v4df_t c03_0, c03_1, c03_2, c03_3, c03_4, c03_5;
  v4df_t c47_0, c47_1, c47_2, c47_3, c47_4, c47_5;
  v4df_t a03, a47, b0, b1;

  #include <rank_k_int_d8x6.h>

  // Prefetch u, w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );

  // c = c * scal
  a03.v   = _mm256_broadcast_sd( &scal );
  c03_0.v = _mm256_mul_pd( a03.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( a03.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( a03.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( a03.v, c03_3.v );
  c03_4.v = _mm256_mul_pd( a03.v, c03_4.v );
  c03_5.v = _mm256_mul_pd( a03.v, c03_5.v );

  c47_0.v = _mm256_mul_pd( a03.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( a03.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( a03.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( a03.v, c47_3.v );
  c47_4.v = _mm256_mul_pd( a03.v, c47_4.v );
  c47_5.v = _mm256_mul_pd( a03.v, c47_5.v );

  // c = c + cons
  a03.v   = _mm256_broadcast_sd( &cons );
  c03_0.v = _mm256_add_pd( a03.v, c03_0.v );
  c03_1.v = _mm256_add_pd( a03.v, c03_1.v );
  c03_2.v = _mm256_add_pd( a03.v, c03_2.v );
  c03_3.v = _mm256_add_pd( a03.v, c03_3.v );
  c03_4.v = _mm256_add_pd( a03.v, c03_4.v );
  c03_5.v = _mm256_add_pd( a03.v, c03_5.v );

  c47_0.v = _mm256_add_pd( a03.v, c47_0.v );
  c47_1.v = _mm256_add_pd( a03.v, c47_1.v );
  c47_2.v = _mm256_add_pd( a03.v, c47_2.v );
  c47_3.v = _mm256_add_pd( a03.v, c47_3.v );
  c47_4.v = _mm256_add_pd( a03.v, c47_4.v );
  c47_5.v = _mm256_add_pd( a03.v, c47_5.v );

  // c = tanh( c );
  c03_0.v  = _mm256_tanh_pd( c03_0.v );
  c03_1.v  = _mm256_tanh_pd( c03_1.v );
  c03_2.v  = _mm256_tanh_pd( c03_2.v );
  c03_3.v  = _mm256_tanh_pd( c03_3.v );
  c03_4.v  = _mm256_tanh_pd( c03_4.v );
  c03_5.v  = _mm256_tanh_pd( c03_5.v );

  c47_0.v  = _mm256_tanh_pd( c47_0.v );
  c47_1.v  = _mm256_tanh_pd( c47_1.v );
  c47_2.v  = _mm256_tanh_pd( c47_2.v );
  c47_3.v  = _mm256_tanh_pd( c47_3.v );
  c47_4.v  = _mm256_tanh_pd( c47_4.v );
  c47_5.v  = _mm256_tanh_pd( c47_5.v );
  
  // Preload u03, u47
  a03.v    = _mm256_load_pd( (double*)  u       );
  a47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  // Multiple rhs weighted sum.
  #include<weighted_sum_int_d8x6.h>
}
Example #14
void ks_gaussian_int_d8x4(
    int    k,
    double alpha,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;

  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t u03;
  v4df_t u47;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B

  v4df_t aa_tmp, bb_tmp;
  v4df_t w_tmp;


  //// Inline vdExp()
  //const double log2e  =  1.4426950408889634073599;
  //const double maxlog =  7.09782712893383996843e2; // log( 2**1024 )
  //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 )
  //const double one    =  1.0;
  //const double c1     =  6.93145751953125E-1;
  //const double c2     =  1.42860682030941723212E-6;

  //// Original Remez Order 11 coefficients
  //const double w11    =  3.5524625185478232665958141148891055719216674475023e-8;
  //const double w10    =  2.5535368519306500343384723775435166753084614063349e-7;
  //const double w9     =  2.77750562801295315877005242757916081614772210463065e-6;
  //const double w8     =  2.47868893393199945541176652007657202642495832996107e-5;
  //const double w7     =  1.98419213985637881240770890090795533564573406893163e-4;
  //const double w6     =  1.3888869684178659239014256260881685824525255547326e-3;
  //const double w5     =  8.3333337052009872221152811550156335074160546333973e-3;
  //const double w4     =  4.1666666621080810610346717440523105184720007971655e-2;
  //const double w3     =  0.166666666669960803484477734308515404418108830469798;
  //const double w2     =  0.499999999999877094481580370323249951329122224389189;
  //const double w1     =  1.0000000000000017952745258419615282194236357388884;
  //const double w0     =  0.99999999999999999566016490920259318691496540598896;

  // Remez Order 11 polynomial approximation
  //const double w0     =  9.9999999999999999694541216787022234814339814028865e-1;
  //const double w1     =  1.0000000000000013347525109964212249781265243645457;
  //const double w2     =  4.9999999999990426011279542064313207349934058355357e-1;
  //const double w3     =  1.6666666666933781279020916199156875162816850273886e-1;
  //const double w4     =  4.1666666628388978913396218847247771982698350546174e-2;
  //const double w5     =  8.3333336552944126722390410619859929515740995889372e-3;
  //const double w6     =  1.3888871805082296012945081624687544823497126781709e-3;
  //const double w7     =  1.9841863599469418342286677256362193951266072398489e-4;
  //const double w8     =  2.4787899938611697691690479138150629377630767114546e-5;
  //const double w9     =  2.7764095757136528235740765949934667970688427190168e-6;
  //const double w10    =  2.5602485412126369546033948405199058329040797134573e-7;
  //const double w11    =  3.5347283721656121939634391175390704621351283546671e-8;

  // Remez Order 9 polynomial approximation
//  const double w0     =  9.9999999999998657717890998293462356769270934668652e-1;
//  const double w1     =  1.0000000000041078023971691258305486059867172736079;
//  const double w2     =  4.9999999979496223000111361187419539211772440139043e-1;
//  const double w3     =  1.6666667059968250851708016603646727895353772273675e-1;
//  const double w4     =  4.1666628655740875994884332519499013211594753124142e-2;
//  const double w5     =  8.3335428149736685441705398632467122758546893330069e-3;
//  const double w6     =  1.3881912931358424526285652289974115047170651985345e-3;
//  const double w7     =  1.9983735415194021112767942931416179152416729204150e-4;
//  const double w8     =  2.3068467290270483679711135625155862511780587976925e-5;
//  const double w9     =  3.8865682386514872192656192137071689334005518164704e-6;




  //v4df_t a03_0, a03_1, a03_2, a03_3;
  //v4df_t a47_0, a47_1, a47_2, a47_3;
  //v4df_t p03_0, p03_1, p03_2, p03_3;
  //v4df_t p47_0, p47_1, p47_2, p47_3;
  //v4df_t y, l2e, tmp, p;
  //v4li_t k03_0, k03_1, k03_2, k03_3;
  //v4li_t k47_0, k47_1, k47_2, k47_3;
  //v4li_t offset;
  //v4li_t k1, k2;
  //__m128d p1, p2;









  int k_iter = k / 2;
  int k_left = k % 2;
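
  // The main loop below is 2x unrolled ( k_iter = k / 2 ); the k_left loop
  // handles an odd trailing column with a single, non-unrolled update.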

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Unrolled iteration #1 ( second half of the 2x-unrolled loop body )
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 2, 3, 0, 1 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


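  // Un-shuffle the accumulators: after the rotated rank-k loop, element i of
  // c03_j / c47_j holds a_i . b_perm(i,j). The blends recombine elements within
  // each 128-bit lane and the lane permutes swap halves, so that afterwards
  // c03_j / c47_j hold column j of the 8x4 block, i.e. element i equals a_i . b_j.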
  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );



  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
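  // Assemble the squared pairwise distances c_ij = aa_i - 2 * ( a_i . b_j ) + bb_j,
  // where aa and bb are assumed to hold the precomputed squared norms of the
  // 8 a-points and the 4 b-points; the -2 scaling comes first.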
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  // Prefetch u
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Guard against negative squared distances caused by rounding error: clamp them to zero
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );



  // Scale before the kernel evaluation
  aa_tmp.v = _mm256_broadcast_sd( &alpha );
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  // Prefetch w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  #include "ks_exp_int_d8x4.h"
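
  // ( ks_exp_int_d8x4.h presumably evaluates c = exp( c ) elementwise, along
  //   the lines of the commented-out Remez-polynomial code in this file )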

  //printf( "square distance\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error Nan: c03_0[ %d ]\n", i );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error Nan: c03_1[ %d ]\n", i );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error Nan: c03_2[ %d ]\n", i );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error Nan: c03_3[ %d ]\n", i );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error Nan: c47_0[ %d ]\n", i );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error Nan: c47_1[ %d ]\n", i );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error Nan: c47_2[ %d ]\n", i );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error Nan: c47_3[ %d ]\n", i );
  //  }
  //}



//  tmp.v     = _mm256_broadcast_sd( &maxlog );
//  c03_0.v   = _mm256_min_pd( tmp.v, c03_0.v ); 
//  c03_1.v   = _mm256_min_pd( tmp.v, c03_1.v ); 
//  c03_2.v   = _mm256_min_pd( tmp.v, c03_2.v ); 
//  c03_3.v   = _mm256_min_pd( tmp.v, c03_3.v ); 
//  c47_0.v   = _mm256_min_pd( tmp.v, c47_0.v ); 
//  c47_1.v   = _mm256_min_pd( tmp.v, c47_1.v ); 
//  c47_2.v   = _mm256_min_pd( tmp.v, c47_2.v ); 
//  c47_3.v   = _mm256_min_pd( tmp.v, c47_3.v ); 
//  tmp.v     = _mm256_broadcast_sd( &minlog );
//  c03_0.v   = _mm256_max_pd( tmp.v, c03_0.v ); 
//  c03_1.v   = _mm256_max_pd( tmp.v, c03_1.v ); 
//  c03_2.v   = _mm256_max_pd( tmp.v, c03_2.v ); 
//  c03_3.v   = _mm256_max_pd( tmp.v, c03_3.v ); 
//  c47_0.v   = _mm256_max_pd( tmp.v, c47_0.v ); 
//  c47_1.v   = _mm256_max_pd( tmp.v, c47_1.v ); 
//  c47_2.v   = _mm256_max_pd( tmp.v, c47_2.v ); 
//  c47_3.v   = _mm256_max_pd( tmp.v, c47_3.v ); 
//
//  // a = c / log2e
//  // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] )
//  l2e.v         = _mm256_broadcast_sd( &log2e );
//  a03_0.v       = _mm256_mul_pd( l2e.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( l2e.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( l2e.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( l2e.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( l2e.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( l2e.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( l2e.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( l2e.v, c47_3.v );
//
//  // Check if a < 0 
//  tmp.v         = _mm256_setzero_pd();
//  p03_0.v       = _mm256_cmp_pd( a03_0.v, tmp.v, 1 );
//  p03_1.v       = _mm256_cmp_pd( a03_1.v, tmp.v, 1 );
//  p03_2.v       = _mm256_cmp_pd( a03_2.v, tmp.v, 1 );
//  p03_3.v       = _mm256_cmp_pd( a03_3.v, tmp.v, 1 );
//  p47_0.v       = _mm256_cmp_pd( a47_0.v, tmp.v, 1 );
//  p47_1.v       = _mm256_cmp_pd( a47_1.v, tmp.v, 1 );
//  p47_2.v       = _mm256_cmp_pd( a47_2.v, tmp.v, 1 );
//  p47_3.v       = _mm256_cmp_pd( a47_3.v, tmp.v, 1 );
//  tmp.v         = _mm256_broadcast_sd( &one );
//  p03_0.v       = _mm256_and_pd( tmp.v, p03_0.v );
//  p03_1.v       = _mm256_and_pd( tmp.v, p03_1.v );
//  p03_2.v       = _mm256_and_pd( tmp.v, p03_2.v );
//  p03_3.v       = _mm256_and_pd( tmp.v, p03_3.v );
//  p47_0.v       = _mm256_and_pd( tmp.v, p47_0.v );
//  p47_1.v       = _mm256_and_pd( tmp.v, p47_1.v );
//  p47_2.v       = _mm256_and_pd( tmp.v, p47_2.v );
//  p47_3.v       = _mm256_and_pd( tmp.v, p47_3.v );
//  // If a < 0 ( w < 0 ), then a - 1 =  ( k - 1 ) + w / ln2 
//  a03_0.v       = _mm256_sub_pd( a03_0.v, p03_0.v );
//  a03_1.v       = _mm256_sub_pd( a03_1.v, p03_1.v );
//  a03_2.v       = _mm256_sub_pd( a03_2.v, p03_2.v );
//  a03_3.v       = _mm256_sub_pd( a03_3.v, p03_3.v );
//  a47_0.v       = _mm256_sub_pd( a47_0.v, p47_0.v );
//  a47_1.v       = _mm256_sub_pd( a47_1.v, p47_1.v );
//  a47_2.v       = _mm256_sub_pd( a47_2.v, p47_2.v );
//  a47_3.v       = _mm256_sub_pd( a47_3.v, p47_3.v );
//  // Compute floor( a ) by two conversions
//  // if a < 0, p = k - 1
//  // else    , p = k
//  k03_0.v       = _mm256_cvttpd_epi32( a03_0.v );
//  k03_1.v       = _mm256_cvttpd_epi32( a03_1.v );
//  k03_2.v       = _mm256_cvttpd_epi32( a03_2.v );
//  k03_3.v       = _mm256_cvttpd_epi32( a03_3.v );
//  k47_0.v       = _mm256_cvttpd_epi32( a47_0.v );
//  k47_1.v       = _mm256_cvttpd_epi32( a47_1.v );
//  k47_2.v       = _mm256_cvttpd_epi32( a47_2.v );
//  k47_3.v       = _mm256_cvttpd_epi32( a47_3.v );
//  p03_0.v       = _mm256_cvtepi32_pd( k03_0.v );
//  p03_1.v       = _mm256_cvtepi32_pd( k03_1.v );
//  p03_2.v       = _mm256_cvtepi32_pd( k03_2.v );
//  p03_3.v       = _mm256_cvtepi32_pd( k03_3.v );
//  p47_0.v       = _mm256_cvtepi32_pd( k47_0.v );
//  p47_1.v       = _mm256_cvtepi32_pd( k47_1.v );
//  p47_2.v       = _mm256_cvtepi32_pd( k47_2.v );
//  p47_3.v       = _mm256_cvtepi32_pd( k47_3.v );
//
//  // ---------------------
//  // x -= p * ln2
//  // ---------------------
//  // c1 = ln2
//  // if a < 0, a = ( k - 1 ) * ln2
//  // else    , a = k * ln2
//  // if a < 0, x -= ( k - 1 ) * ln2
//  // else    , x -= k * ln2
//  //
//  tmp.v         = _mm256_broadcast_sd( &c1 );
//  a03_0.v       = _mm256_mul_pd( tmp.v, p03_0.v );
//  a03_1.v       = _mm256_mul_pd( tmp.v, p03_1.v );
//  a03_2.v       = _mm256_mul_pd( tmp.v, p03_2.v );
//  a03_3.v       = _mm256_mul_pd( tmp.v, p03_3.v );
//  a47_0.v       = _mm256_mul_pd( tmp.v, p47_0.v );
//  a47_1.v       = _mm256_mul_pd( tmp.v, p47_1.v );
//  a47_2.v       = _mm256_mul_pd( tmp.v, p47_2.v );
//  a47_3.v       = _mm256_mul_pd( tmp.v, p47_3.v );
//  c03_0.v       = _mm256_sub_pd( c03_0.v, a03_0.v );
//  c03_1.v       = _mm256_sub_pd( c03_1.v, a03_1.v );
//  c03_2.v       = _mm256_sub_pd( c03_2.v, a03_2.v );
//  c03_3.v       = _mm256_sub_pd( c03_3.v, a03_3.v );
//  c47_0.v       = _mm256_sub_pd( c47_0.v, a47_0.v );
//  c47_1.v       = _mm256_sub_pd( c47_1.v, a47_1.v );
//  c47_2.v       = _mm256_sub_pd( c47_2.v, a47_2.v );
//  c47_3.v       = _mm256_sub_pd( c47_3.v, a47_3.v );
//  tmp.v         = _mm256_broadcast_sd( &c2 );
//  a03_0.v       = _mm256_mul_pd( tmp.v, p03_0.v );
//  a03_1.v       = _mm256_mul_pd( tmp.v, p03_1.v );
//  a03_2.v       = _mm256_mul_pd( tmp.v, p03_2.v );
//  a03_3.v       = _mm256_mul_pd( tmp.v, p03_3.v );
//  a47_0.v       = _mm256_mul_pd( tmp.v, p47_0.v );
//  a47_1.v       = _mm256_mul_pd( tmp.v, p47_1.v );
//  a47_2.v       = _mm256_mul_pd( tmp.v, p47_2.v );
//  a47_3.v       = _mm256_mul_pd( tmp.v, p47_3.v );
//  c03_0.v       = _mm256_sub_pd( c03_0.v, a03_0.v );
//  c03_1.v       = _mm256_sub_pd( c03_1.v, a03_1.v );
//  c03_2.v       = _mm256_sub_pd( c03_2.v, a03_2.v );
//  c03_3.v       = _mm256_sub_pd( c03_3.v, a03_3.v );
//  c47_0.v       = _mm256_sub_pd( c47_0.v, a47_0.v );
//  c47_1.v       = _mm256_sub_pd( c47_1.v, a47_1.v );
//  c47_2.v       = _mm256_sub_pd( c47_2.v, a47_2.v );
//  c47_3.v       = _mm256_sub_pd( c47_3.v, a47_3.v );
//
//
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );
//
//
//  // Prefetch u
//  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );
//
//
//
//  // Compute e^x using polynomial approximation
//  // a = w10 + w11 * x
//  tmp.v         = _mm256_broadcast_sd( &w11 );
//  //tmp.v         = _mm256_broadcast_sd( &w9 );
//  a03_0.v       = _mm256_mul_pd( c03_0.v, tmp.v );
//  a03_1.v       = _mm256_mul_pd( c03_1.v, tmp.v );
//  a03_2.v       = _mm256_mul_pd( c03_2.v, tmp.v );
//  a03_3.v       = _mm256_mul_pd( c03_3.v, tmp.v );
//  a47_0.v       = _mm256_mul_pd( c47_0.v, tmp.v );
//  a47_1.v       = _mm256_mul_pd( c47_1.v, tmp.v );
//  a47_2.v       = _mm256_mul_pd( c47_2.v, tmp.v );
//  a47_3.v       = _mm256_mul_pd( c47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w10 );
//  //tmp.v         = _mm256_broadcast_sd( &w8 );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x
//  tmp.v         = _mm256_broadcast_sd( &w9 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w8 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w7 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w6 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w5 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w4 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // Prefetch w
//  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );
//  // Preload u03
//  u03.v    = _mm256_load_pd( (double*)u );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w3 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w2 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w1 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w0 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // Preload u47
//  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );
//
//
//  offset.v      = _mm_setr_epi32( 1023, 1023, 0, 0 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_0.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_1.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_2.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_3.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_0.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_1.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_2.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_3.v       = _mm256_set_m128d( p2, p1 );
//  
// 
//  //u03.v    = _mm256_load_pd( (double*)u );
//  //u47.v    = _mm256_load_pd( (double*)( u + 4 ) );
//
//
//  c03_0.v       = _mm256_mul_pd( a03_0.v, p03_0.v );
//  c03_1.v       = _mm256_mul_pd( a03_1.v, p03_1.v );
//  c03_2.v       = _mm256_mul_pd( a03_2.v, p03_2.v );
//  c03_3.v       = _mm256_mul_pd( a03_3.v, p03_3.v );
//  c47_0.v       = _mm256_mul_pd( a47_0.v, p47_0.v );
//  c47_1.v       = _mm256_mul_pd( a47_1.v, p47_1.v );
//  c47_2.v       = _mm256_mul_pd( a47_2.v, p47_2.v );
//  c47_3.v       = _mm256_mul_pd( a47_3.v, p47_3.v );



  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error exp Nan: c03_0[ %d ]\n", i );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error exp Nan: c03_1[ %d ]\n", i );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error exp Nan: c03_2[ %d ]\n", i );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error exp Nan: c03_3[ %d ]\n", i );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error exp Nan: c47_0[ %d ]\n", i );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error exp Nan: c47_1[ %d ]\n", i );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error exp Nan: c47_2[ %d ]\n", i );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error exp Nan: c47_3[ %d ]\n", i );
  //  }
  //}




  //printf( "exp\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  //printf( "w\n" );
  //printf( "%lf, %lf, %lf, %lf\n", w[0], w[1], w[2], w[3] );


  //u03.v    = _mm256_load_pd( (double*)u );
  //u47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  w_tmp.v  = _mm256_broadcast_sd( (double*)w );
  c03_0.v  = _mm256_mul_pd( w_tmp.v, c03_0.v );
  c47_0.v  = _mm256_mul_pd( w_tmp.v, c47_0.v );
  u03.v    = _mm256_add_pd( u03.v, c03_0.v );
  u47.v    = _mm256_add_pd( u47.v, c47_0.v );
 

  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}


  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 1 ) );
  c03_1.v  = _mm256_mul_pd( w_tmp.v, c03_1.v );
  c47_1.v  = _mm256_mul_pd( w_tmp.v, c47_1.v );
  u03.v    = _mm256_add_pd( u03.v, c03_1.v );
  u47.v    = _mm256_add_pd( u47.v, c47_1.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}

  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 2 ) );
  c03_2.v  = _mm256_mul_pd( w_tmp.v, c03_2.v );
  c47_2.v  = _mm256_mul_pd( w_tmp.v, c47_2.v );
  u03.v    = _mm256_add_pd( u03.v, c03_2.v );
  u47.v    = _mm256_add_pd( u47.v, c47_2.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}

  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 3 ) );
  c03_3.v  = _mm256_mul_pd( w_tmp.v, c03_3.v );
  c47_3.v  = _mm256_mul_pd( w_tmp.v, c47_3.v );
  u03.v    = _mm256_add_pd( u03.v, c03_3.v );
  u47.v    = _mm256_add_pd( u47.v, c47_3.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}



  _mm256_store_pd( (double*)u, u03.v );
  _mm256_store_pd( (double*)( u + 4 ), u47.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error gemv Nan: c03_0[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error gemv Nan: c03_1[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error gemv Nan: c03_2[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error gemv Nan: c03_3[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error gemv Nan: c47_0[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error gemv Nan: c47_1[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error gemv Nan: c47_2[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error gemv Nan: c47_3[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //}


  //for ( i = 0; i < 4; i ++ ) {
  //  if ( w[ i ] != w[ i ] ) {
  //    printf( "GSKS error w Nan: w03[ %d ]\n", i );
  //  }
  //}


  //for ( i = 0; i < 4; i++ ) {
  //  if ( u03.d[ i ] != u03.d[ i ] ) {
  //    printf( "GSKS error u Nan: u03[ %d ]\n", i );
  //  }
  //  if ( u47.d[ i ] != u47.d[ i ] ) {
  //    printf( "GSKS error u Nan: u47[ %d ]\n", i );
  //  }
  //}



  //printf( "%lf\n", u03.d[0] );
  //printf( "%lf\n", u03.d[1] );
  //printf( "%lf\n", u03.d[2] );
  //printf( "%lf\n", u03.d[3] );
  //printf( "%lf\n", u47.d[0] );
  //printf( "%lf\n", u47.d[1] );
  //printf( "%lf\n", u47.d[2] );
  //printf( "%lf\n", u47.d[3] );
}
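
For reference, a scalar sketch of what ks_gaussian_int_d8x4 above appears to compute, assuming aa and bb hold the packed squared norms of the 8 and 4 points and that ks_exp_int_d8x4.h evaluates c = exp( c ) elementwise; the function name and layout notes are illustrative only, not part of the library:

#include <math.h>

/* Hypothetical scalar reference for the 8x4 Gaussian kernel micro-kernel:
   a is packed as 8 doubles per k-step, b as 4 doubles per k-step,
   aa[0..7] / bb[0..3] are assumed squared norms, and u[0..7] accumulates
   the kernel sums weighted by w[0..3]. */
static void ks_gaussian_ref_d8x4( int k, double alpha,
                                  double *u, const double *aa, const double *a,
                                  const double *bb, const double *b,
                                  const double *w )
{
  int i, j, p;
  for ( j = 0; j < 4; j ++ ) {
    for ( i = 0; i < 8; i ++ ) {
      double c = 0.0;
      for ( p = 0; p < k; p ++ ) {
        c += a[ p * 8 + i ] * b[ p * 4 + j ];   /* rank-k update */
      }
      c = aa[ i ] - 2.0 * c + bb[ j ];          /* squared pairwise distance */
      if ( c < 0.0 ) c = 0.0;                   /* clamp rounding error */
      u[ i ] += w[ j ] * exp( alpha * c );      /* weighted kernel summation */
    }
  }
}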
Exemple #15
0
// it moves horizontally inside a block
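// Computes a 4-row slice of y = A * x: A is stored in column-major panels of
// 4 rows ( lda = 4 ) and kmax columns are processed 4 at a time, with the
// partial sums split across four accumulators to hide the add latency.
// alg selects the write-back: alg == 0 stores y = A*x, alg == 1 computes
// y += A*x, otherwise ( alg == -1 ) y -= A*x.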
void kernel_dgemv_n_4_lib4(int kmax, double *A, double *x, double *y, int alg)
	{
	if(kmax<=0) 
		return;
	
	const int lda = 4;
	
	int k;

	__m256d
		ax_temp,
		a_00_10_20_30, a_01_11_21_31, a_02_12_22_32, a_03_13_23_33,
		x_0, x_1, x_2, x_3,
		y_0_1_2_3, y_0_1_2_3_b, y_0_1_2_3_c, y_0_1_2_3_d, z_0_1_2_3;
	
	y_0_1_2_3   = _mm256_setzero_pd();	
	y_0_1_2_3_b = _mm256_setzero_pd();	
	y_0_1_2_3_c = _mm256_setzero_pd();	
	y_0_1_2_3_d = _mm256_setzero_pd();	

	k=0;
	for(; k<kmax-3; k+=4)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		x_2 = _mm256_broadcast_sd( &x[2] );
		x_3 = _mm256_broadcast_sd( &x[3] );

		a_02_12_22_32 = _mm256_load_pd( &A[0+lda*2] );
		a_03_13_23_33 = _mm256_load_pd( &A[0+lda*3] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		ax_temp = _mm256_mul_pd( a_02_12_22_32, x_2 );
		y_0_1_2_3_c = _mm256_add_pd( y_0_1_2_3_c, ax_temp );
		ax_temp = _mm256_mul_pd( a_03_13_23_33, x_3 );
		y_0_1_2_3_d = _mm256_add_pd( y_0_1_2_3_d, ax_temp );
		
		A += 4*lda;
		x += 4;

		}
	
	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_c );
	y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, y_0_1_2_3_d );

	if(kmax%4>=2)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );
		x_1 = _mm256_broadcast_sd( &x[1] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );
		a_01_11_21_31 = _mm256_load_pd( &A[0+lda*1] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		ax_temp = _mm256_mul_pd( a_01_11_21_31, x_1 );
		y_0_1_2_3_b = _mm256_add_pd( y_0_1_2_3_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, y_0_1_2_3_b );

	if(kmax%2==1)
		{

		x_0 = _mm256_broadcast_sd( &x[0] );

		a_00_10_20_30 = _mm256_load_pd( &A[0+lda*0] );

		ax_temp = _mm256_mul_pd( a_00_10_20_30, x_0 );
		y_0_1_2_3 = _mm256_add_pd( y_0_1_2_3, ax_temp );
		
/*		A += 1*lda;*/
/*		x += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_pd(&y[0], y_0_1_2_3);
		}
	else if(alg==1)
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_add_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}
	else // alg==-1
		{
		z_0_1_2_3 = _mm256_loadu_pd( &y[0] );

		z_0_1_2_3 = _mm256_sub_pd ( z_0_1_2_3, y_0_1_2_3 );

		_mm256_storeu_pd(&y[0], z_0_1_2_3);
		}

	}
Exemple #16
0
    // ymm00 = _mm256_add_pd(ymm00, ymm01); ymm02 = _mm256_add_pd(ymm02, ymm03);
    // ymm04 = _mm256_add_pd(ymm04, ymm05); ymm06 = _mm256_add_pd(ymm06, ymm07);
    // ymm00 = _mm256_add_pd(ymm00, ymm02); ymm04 = _mm256_add_pd(ymm04, ymm06);
    // ymm00 = _mm256_add_pd(ymm00, ymm04);// ymm00 now holds the left half
    // // add up right half
    // ymm08 = _mm256_add_pd(ymm08, ymm09); ymm10 = _mm256_add_pd(ymm10, ymm11);
    // ymm12 = _mm256_add_pd(ymm12, ymm13); ymm14 = _mm256_add_pd(ymm14, ymm15);
    // ymm08 = _mm256_add_pd(ymm08, ymm10); ymm12 = _mm256_add_pd(ymm12, ymm14);
    // ymm08 = _mm256_add_pd(ymm08, ymm12);// ymm08 holds right half

    // // ym00 and ym08 now hold the left and right halves, store back in C
    // _mm256_store_pd((double *) (C + row*8), ymm00); _mm256_store_pd((double *) (C + row*8 + 4), ymm08);

    // Broadcast each element of matrix A Row [row] into a ymm register
    // If row = [ a b c d e f g h ], then we need two registers for each
    ymm00 = _mm256_broadcast_sd((double *)(A + row*8 + 0)); ymm01 = _mm256_broadcast_sd((double *)(A + row*8 + 0));// a
    ymm02 = _mm256_broadcast_sd((double *)(A + row*8 + 1)); ymm03 = _mm256_broadcast_sd((double *)(A + row*8 + 1));// b
    ymm04 = _mm256_broadcast_sd((double *)(A + row*8 + 2)); ymm05 = _mm256_broadcast_sd((double *)(A + row*8 + 2));// c
    ymm06 = _mm256_broadcast_sd((double *)(A + row*8 + 3)); ymm07 = _mm256_broadcast_sd((double *)(A + row*8 + 3));// d
    ymm08 = _mm256_broadcast_sd((double *)(A + row*8 + 4)); ymm09 = _mm256_broadcast_sd((double *)(A + row*8 + 4));// e
    ymm10 = _mm256_broadcast_sd((double *)(A + row*8 + 5)); ymm11 = _mm256_broadcast_sd((double *)(A + row*8 + 5));// f
    ymm12 = _mm256_broadcast_sd((double *)(A + row*8 + 6)); ymm13 = _mm256_broadcast_sd((double *)(A + row*8 + 6));// g
    ymm14 = _mm256_broadcast_sd((double *)(A + row*8 + 7)); ymm15 = _mm256_broadcast_sd((double *)(A + row*8 + 7));// h
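
    // Two broadcast copies of each scalar: one multiplies the left 4-wide half
    // and the other the right 4-wide half of the corresponding row of B
    // ( ymm16/ymm17 for row 1, ymm18/ymm19 for row 2, ... ).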

    // multiply
    // left half
    ymm00 = _mm256_mul_pd(ymm00, ymm16); ymm01 = _mm256_mul_pd(ymm01, ymm17);// row 1
    ymm02 = _mm256_mul_pd(ymm02, ymm18); ymm03 = _mm256_mul_pd(ymm03, ymm19);// row 2
    ymm04 = _mm256_mul_pd(ymm04, ymm20); ymm05 = _mm256_mul_pd(ymm05, ymm21);// row 3
    ymm06 = _mm256_mul_pd(ymm06, ymm22); ymm07 = _mm256_mul_pd(ymm07, ymm23);// row 4
    ymm08 = _mm256_mul_pd(ymm08, ymm24); ymm09 = _mm256_mul_pd(ymm09, ymm25);// row 5
Exemple #17
0
void ks_tanh_int_d8x4(
    int    k,
    int    rhs,
    double *h,  // NOP
    double *u,
    double *aa, // NOP
    double *a,
    double *bb, // NOP
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i, rhs_left;
  double scal = ker->scal;
  double cons = ker->cons;


  v4df_t    c03_0,    c03_1,    c03_2,    c03_3;
  v4df_t    c47_0,    c47_1,    c47_2,    c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A 
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;


  // Rank-k update segment
  #include "ks_rank_k_int_d8x4.h"
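  // ( ks_rank_k_int_d8x4.h presumably contains the same 2x-unrolled 8x4
  //   rank-k update and blend/permute un-shuffle that is written out
  //   explicitly in ks_gaussian_int_d8x4 above )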


  // Accumulate
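  // ( aux->pc != 0 presumably signals that an earlier k-panel already wrote
  //   partial results into c, so they are reloaded and added back in )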
  if ( aux->pc ) {
    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }


  // Scale before the kernel evaluation
  c_tmp.v  = _mm256_broadcast_sd( &scal );
  c03_0.v  = _mm256_mul_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( c_tmp.v, c47_3.v );


  // Shift before the kernel evaluation
  c_tmp.v  = _mm256_broadcast_sd( &cons );
  c03_0.v  = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( c_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );


  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  // c = tanh( c );
  c03_0.v  = _mm256_tanh_pd( c03_0.v );
  c03_1.v  = _mm256_tanh_pd( c03_1.v );
  c03_2.v  = _mm256_tanh_pd( c03_2.v );
  c03_3.v  = _mm256_tanh_pd( c03_3.v );
  c47_0.v  = _mm256_tanh_pd( c47_0.v );
  c47_1.v  = _mm256_tanh_pd( c47_1.v );
  c47_2.v  = _mm256_tanh_pd( c47_2.v );
  c47_3.v  = _mm256_tanh_pd( c47_3.v );
  
  
  // Multiple rhs kernel summation.
  #include "ks_kernel_summation_int_d8x4.h"
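  // ( ks_kernel_summation_int_d8x4.h presumably performs the multiple-rhs
  //   update u += c * w, mirroring the explicit w_tmp accumulation in
  //   ks_gaussian_int_d8x4 above )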

}
LIBXSMM_EXTERN_C LIBXSMM_RETARGETABLE
void stream_update_helmholtz_no_h2( const double* i_g1,
                                    const double* i_g2,
                                    const double* i_g3, 
                                    const double* i_tm1,
                                    const double* i_tm2,
                                    const double* i_tm3,
                                    double*       io_c,
                                    const double  i_h1,
                                    const int     i_length) {
  int l_n = 0;
  int l_trip_prolog = 0;
  int l_trip_stream = 0;
  
  /* init the trip counts */
  stream_init( i_length, (size_t)io_c, &l_trip_prolog, &l_trip_stream );
  
  /* run the prologue */
  for ( ; l_n < l_trip_prolog;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
  /* run the bulk, hopefully using streaming stores */
#if defined(__SSE3__) && defined(__AVX__) && !defined(__AVX512F__)
  {
    const __m256d vec_h1 = _mm256_broadcast_sd(&i_h1);
    /* we need manual unrolling as the compiler otherwise generates 
       too many dependencies */
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m256d vec_g1_1, vec_g2_1, vec_g3_1, vec_tm1_1, vec_tm2_1, vec_tm3_1;
      __m256d vec_g1_2, vec_g2_2, vec_g3_2, vec_tm1_2, vec_tm2_2, vec_tm3_2;

      vec_g1_1 = _mm256_loadu_pd(&(i_g1[l_n]));
      vec_tm1_1 = _mm256_loadu_pd(&(i_tm1[l_n]));
      vec_g1_2 = _mm256_loadu_pd(&(i_g1[l_n+4]));
      vec_tm1_2 = _mm256_loadu_pd(&(i_tm1[l_n+4]));

      vec_g1_1 = _mm256_mul_pd(vec_g1_1, vec_tm1_1);
      vec_g2_1 = _mm256_loadu_pd(&(i_g2[l_n]));
      vec_g1_2 = _mm256_mul_pd(vec_g1_2, vec_tm1_2);
      vec_g2_2 = _mm256_loadu_pd(&(i_g2[l_n+4]));

      vec_tm2_1 = _mm256_loadu_pd(&(i_tm2[l_n]));
      vec_g2_1 = _mm256_mul_pd(vec_g2_1, vec_tm2_1);
      vec_tm2_2 = _mm256_loadu_pd(&(i_tm2[l_n+4]));
      vec_g2_2 = _mm256_mul_pd(vec_g2_2, vec_tm2_2);

      vec_g3_1 = _mm256_loadu_pd(&(i_g3[l_n]));
      vec_tm3_1 = _mm256_loadu_pd(&(i_tm3[l_n]));
      vec_g3_2 = _mm256_loadu_pd(&(i_g3[l_n+4]));
      vec_tm3_2 = _mm256_loadu_pd(&(i_tm3[l_n+4]));

      vec_g3_1 = _mm256_mul_pd(vec_g3_1, vec_tm3_1);
      vec_g3_2 = _mm256_mul_pd(vec_g3_2, vec_tm3_2);
      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g2_1);
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g2_2);

      vec_g1_1 = _mm256_add_pd(vec_g1_1, vec_g3_1);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n]), _mm256_mul_pd(vec_g1_1, vec_h1) );
#endif
      vec_g1_2 = _mm256_add_pd(vec_g1_2, vec_g3_2);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm256_store_pd(  &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#else
      _mm256_stream_pd( &(io_c[l_n+4]), _mm256_mul_pd(vec_g1_2, vec_h1) );
#endif
    }
  }
#elif defined(__SSE3__) && defined(__AVX__) && defined(__AVX512F__)
  {
    const __m512d vec_h1 = _mm512_broadcastsd_pd(_mm_load_sd(&i_h1));
    for ( ; l_n < l_trip_stream;  l_n+=8 ) {
      __m512d vec_g1, vec_g2, vec_g3, vec_tm1, vec_tm2, vec_tm3;
      vec_g1 = _mm512_loadu_pd(&(i_g1[l_n]));
      vec_tm1 = _mm512_loadu_pd(&(i_tm1[l_n]));
      vec_g1 = _mm512_mul_pd(vec_g1, vec_tm1);
      vec_g2 = _mm512_loadu_pd(&(i_g2[l_n]));
      vec_tm2 = _mm512_loadu_pd(&(i_tm2[l_n]));
      vec_g2 = _mm512_mul_pd(vec_g2, vec_tm2);
      vec_g3 = _mm512_loadu_pd(&(i_g3[l_n]));
      vec_tm3 = _mm512_loadu_pd(&(i_tm3[l_n]));
      vec_g3 = _mm512_mul_pd(vec_g3, vec_tm3);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g2);
      vec_g1 = _mm512_add_pd(vec_g1, vec_g3);
#ifdef DISABLE_NONTEMPORAL_STORES
      _mm512_store_pd(  &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#else
      _mm512_stream_pd( &(io_c[l_n]), _mm512_mul_pd(vec_g1, vec_h1) );
#endif
    }
  }
#else
  for ( ; l_n < l_trip_stream;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
#endif
  /* run the epilogue */
  for ( ; l_n < i_length;  l_n++ ) {
    io_c[l_n] =   i_h1*(i_g1[l_n]*i_tm1[l_n] + i_g2[l_n]*i_tm2[l_n] + i_g3[l_n]*i_tm3[l_n]);
  }
}