Esempio n. 1
0
boost::optional<double> SimpleClean::FindPeakAVX(const double *image, size_t width, size_t height, size_t& x, size_t& y, size_t startY, size_t endY, size_t horizontalBorder, size_t verticalBorder)
{
	double peakMax = std::numeric_limits<double>::min();
	size_t peakIndex = 0;
	
	__m256d mPeakMax = _mm256_set1_pd(peakMax);
	
	size_t xiStart = horizontalBorder, xiEnd = width - horizontalBorder;
	size_t yiStart = std::max(startY, verticalBorder), yiEnd = std::min(endY, height - verticalBorder);
	if(xiEnd < xiStart) xiEnd = xiStart;
	if(yiEnd < yiStart) yiEnd = yiStart;
	
	for(size_t yi=yiStart; yi!=yiEnd; ++yi)
	{
		size_t index = yi*width + xiStart;
		const double* const endPtr = image + yi*width + xiEnd - 4;
		const double *i=image + index;
		for(; i<endPtr; i+=4)
		{
			__m256d val = _mm256_loadu_pd(i);
			if(AllowNegativeComponent) {
				__m256d negVal = _mm256_sub_pd(_mm256_set1_pd(0.0), val);
				val = _mm256_max_pd(val, negVal);
			}
			int mask = _mm256_movemask_pd(_mm256_cmp_pd(val, mPeakMax, _CMP_GT_OQ));
			if(mask != 0)
			{
				for(size_t di=0; di!=4; ++di)
				{
					double value = i[di];
					if(AllowNegativeComponent) value = std::fabs(value);
					if(value > peakMax)
					{
						peakIndex = index+di;
						peakMax = std::fabs(i[di]);
						mPeakMax = _mm256_set1_pd(peakMax);
					}
				}
			}
			index+=4;
		}
		for(; i!=endPtr+4; ++i)
		{
			double value = *i;
			if(AllowNegativeComponent) value = std::fabs(value);
			if(value > peakMax)
			{
				peakIndex = index;
				peakMax = std::fabs(*i);
			}
			++index;
		}
	}
	x = peakIndex % width;
	y = peakIndex / width;
	return image[x + y*width];
}
Esempio n. 2
0
__SIMDd _SIMD_max_pd(__SIMDd a, __SIMDd b)
{
#ifdef  USE_SSE
  return _mm_max_pd(a,b); 
#elif defined USE_AVX
  return _mm256_max_pd(a,b); 
#elif defined USE_IBM
  return vec_max(a,b);
#endif
}
Esempio n. 3
0
 /*!
  * \brief Compute the maximum between each pair element of the given vectors
  */
 ETL_STATIC_INLINE(avx_simd_double) max(avx_simd_double lhs, avx_simd_double rhs) {
     return _mm256_max_pd(lhs.value, rhs.value);
 }
Esempio n. 4
0
// Process audio effects for 8 channels simultaneously:
void processEffects(const vec8_i32 &inpSamples, vec8_i32 &outSamples, const long n)
{
    // Extract int samples and convert to doubles:
    const vec4_d64 ds0 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 0)),
        _mm256_set1_pd((double)INT_MAX)
        );
    const vec4_d64 ds1 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 1)),
        _mm256_set1_pd((double)INT_MAX)
        );

    // Monitor input levels:
    fx.fi_monitor.levels[n + 0] = scalar_to_dBFS(ds0);
    fx.fi_monitor.levels[n + 1] = scalar_to_dBFS(ds1);

    vec4_d64 s0, s1;

    // f0_gain:
    {
        s0 = _mm256_mul_pd(ds0, fx.f0_gain.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(ds1, fx.f0_gain.calc.gain[n + 1]);
    }

    // Monitor levels:
    fx.f0_output.levels[n + 0] = scalar_to_dBFS(s0);
    fx.f0_output.levels[n + 1] = scalar_to_dBFS(s1);

    // f1_compressor:
    {
        const vec4_dBFS l0 = scalar_to_dBFS_offs(s0);
        const vec4_dBFS l1 = scalar_to_dBFS_offs(s1);

        // over = s - thresh
        vec4_dB over0 = _mm256_sub_pd(l0, fx.f1_compressor.input.threshold[n + 0]);
        vec4_dB over1 = _mm256_sub_pd(l1, fx.f1_compressor.input.threshold[n + 1]);

        // over = if over < 0.0 then 0.0 else over;
        over0 = mm256_if_then_else(_mm256_cmp_pd(over0, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over0);
        over1 = mm256_if_then_else(_mm256_cmp_pd(over1, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over1);

        // over += DC_OFFSET
        over0 = _mm256_add_pd(over0, DC_OFFSET);
        over1 = _mm256_add_pd(over1, DC_OFFSET);

        // env = over + coef * ( env - over )
        const vec4_dB attack_env0  = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB attack_env1  = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));
        const vec4_dB release_env0  = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB release_env1  = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));

        // env = if over > env then attack_env else release_env
        fx.f1_compressor.state.env[n + 0] = mm256_if_then_else(_mm256_cmp_pd(over0, fx.f1_compressor.state.env[n + 0], _CMP_GT_OQ), attack_env0, release_env0);
        fx.f1_compressor.state.env[n + 1] = mm256_if_then_else(_mm256_cmp_pd(over1, fx.f1_compressor.state.env[n + 1], _CMP_GT_OQ), attack_env1, release_env1);

        // over = env - DC_OFFSET
        over0 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], DC_OFFSET);
        over1 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], DC_OFFSET);

        // grdB = ( over * ( ratio - 1.0 ) )
        vec4_dB gr0dB = _mm256_mul_pd(over0, fx.f1_compressor.calc.ratio_min_1[n + 0]);
        vec4_dB gr1dB = _mm256_mul_pd(over1, fx.f1_compressor.calc.ratio_min_1[n + 1]);

        // gr = dB_to_scalar(grdB)
        fx.f1_compressor.monitor.gain_reduction[n + 0] = dB_to_scalar(gr0dB);
        fx.f1_compressor.monitor.gain_reduction[n + 1] = dB_to_scalar(gr1dB);

        // Apply gain reduction to inputs:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.monitor.gain_reduction[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.monitor.gain_reduction[n + 1]);

        // Apply make-up gain:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.calc.gain[n + 1]);
    }

    // Monitor output levels:
    fx.fo_monitor.levels[n + 0] = scalar_to_dBFS(s0);
    fx.fo_monitor.levels[n + 1] = scalar_to_dBFS(s1);

    // TODO(jsd): Better limiter implementation!
    // Limit final samples:
    s0 = _mm256_max_pd(_mm256_min_pd(s0, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0));
    s1 = _mm256_max_pd(_mm256_min_pd(s1, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0));

    // Convert doubles back to 32-bit ints:
    s0 = _mm256_mul_pd(s0, _mm256_set1_pd((double)INT_MAX));
    s1 = _mm256_mul_pd(s1, _mm256_set1_pd((double)INT_MAX));
    const vec8_i32 os = _mm256_setr_m128i(_mm256_cvtpd_epi32(s0), _mm256_cvtpd_epi32(s1));

    // Write outputs:
    _mm256_stream_si256(&outSamples, os);
}
Esempio n. 5
0
 inline vector4d max(const vector4d& lhs, const vector4d& rhs)
 {
     return _mm256_max_pd(lhs, rhs);
 }
Esempio n. 6
0
 inline F64vec4 max(const F64vec4 &l, const F64vec4 &r)
 {
     return _mm256_max_pd(l, r);
 }
Esempio n. 7
0
BI_FORCE_INLINE inline avx_double max(const avx_double x,
    const avx_double y) {
  avx_double res;
  res.packed = _mm256_max_pd(x.packed, y.packed);
  return res;
}
Esempio n. 8
0
void ks_multiquadratic_int_d8x4(
    int    k,
    int    rhs,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i, rhs_left;
  double neg2  = -2.0;
  double dzero =  0.0;
  double done  =  1.0;
  double mdone = -1.0;
  double alpha = ( 3.0 / 4.0 );
  double cons  = ker->cons;


  v4df_t    c03_0,    c03_1,    c03_2,    c03_3;
  v4df_t    c47_0,    c47_1,    c47_2,    c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A 
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;


  // Rank-k update segment
  #include "ks_rank_k_int_d8x4.h"


  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  // Accumulate
  if ( aux->pc ) {
    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c + 4  ) );
    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c + 8  ) );
    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }


  // Scale -2
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );


  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );


  // Check if there is any illegle value 
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );


  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  // c = c + cons
  c_tmp.v  = _mm256_broadcast_sd( &cons );
  c03_0.v  = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( c_tmp.v, c47_3.v );

  // Multiple rhs kernel summation.
  #include "ks_kernel_summation_int_d8x4.h"

}
Esempio n. 9
0
void rnn_int_d8x4_var2(
    int    k,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *c,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B
  v4df_t aa_tmp, bb_tmp;


  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( c ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->I ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->D ) );


  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Check if there is any illegle value 
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );


  // Transpose c03/c47 _0, _1, _2, _3 to be the row vector
  tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 );
  tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF );

  tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 );
  tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF );

  tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 );
  tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF );

  tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 );
  tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 );


  // c03_0;
  // c03_1;
  // c03_2;
  // c03_3;
  // c47_0;
  // c47_1;
  // c47_2;
  // c47_3;


  _mm256_store_pd( c     , c03_0.v );
  _mm256_store_pd( c +  4, c03_1.v );
  _mm256_store_pd( c +  8, c03_2.v );
  _mm256_store_pd( c + 12, c03_3.v );
  _mm256_store_pd( c + 16, c47_0.v );
  _mm256_store_pd( c + 20, c47_1.v );
  _mm256_store_pd( c + 24, c47_2.v );
  _mm256_store_pd( c + 28, c47_3.v );
}
Esempio n. 10
0
void ks_gaussian_int_d8x4(
    int    k,
    double alpha,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    aux_t  *aux
    )
{
  int    i;
  double neg2 = -2.0;
  double dzero = 0.0;

  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t u03;
  v4df_t u47;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A 

  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B

  v4df_t aa_tmp, bb_tmp;
  v4df_t w_tmp;


  //// Inline vdExp()
  //const double log2e  =  1.4426950408889634073599;
  //const double maxlog =  7.09782712893383996843e2; // log( 2**1024 )
  //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 )
  //const double one    =  1.0;
  //const double c1     =  6.93145751953125E-1;
  //const double c2     =  1.42860682030941723212E-6;

  //// Original Remez Order 11 coefficients
  //const double w11    =  3.5524625185478232665958141148891055719216674475023e-8;
  //const double w10    =  2.5535368519306500343384723775435166753084614063349e-7;
  //const double w9     =  2.77750562801295315877005242757916081614772210463065e-6;
  //const double w8     =  2.47868893393199945541176652007657202642495832996107e-5;
  //const double w7     =  1.98419213985637881240770890090795533564573406893163e-4;
  //const double w6     =  1.3888869684178659239014256260881685824525255547326e-3;
  //const double w5     =  8.3333337052009872221152811550156335074160546333973e-3;
  //const double w4     =  4.1666666621080810610346717440523105184720007971655e-2;
  //const double w3     =  0.166666666669960803484477734308515404418108830469798;
  //const double w2     =  0.499999999999877094481580370323249951329122224389189;
  //const double w1     =  1.0000000000000017952745258419615282194236357388884;
  //const double w0     =  0.99999999999999999566016490920259318691496540598896;

  // Remez Order 11 polynomail approximation
  //const double w0     =  9.9999999999999999694541216787022234814339814028865e-1;
  //const double w1     =  1.0000000000000013347525109964212249781265243645457;
  //const double w2     =  4.9999999999990426011279542064313207349934058355357e-1;
  //const double w3     =  1.6666666666933781279020916199156875162816850273886e-1;
  //const double w4     =  4.1666666628388978913396218847247771982698350546174e-2;
  //const double w5     =  8.3333336552944126722390410619859929515740995889372e-3;
  //const double w6     =  1.3888871805082296012945081624687544823497126781709e-3;
  //const double w7     =  1.9841863599469418342286677256362193951266072398489e-4;
  //const double w8     =  2.4787899938611697691690479138150629377630767114546e-5;
  //const double w9     =  2.7764095757136528235740765949934667970688427190168e-6;
  //const double w10    =  2.5602485412126369546033948405199058329040797134573e-7;
  //const double w11    =  3.5347283721656121939634391175390704621351283546671e-8;

  // Remez Order 9 polynomail approximation
//  const double w0     =  9.9999999999998657717890998293462356769270934668652e-1;
//  const double w1     =  1.0000000000041078023971691258305486059867172736079;
//  const double w2     =  4.9999999979496223000111361187419539211772440139043e-1;
//  const double w3     =  1.6666667059968250851708016603646727895353772273675e-1;
//  const double w4     =  4.1666628655740875994884332519499013211594753124142e-2;
//  const double w5     =  8.3335428149736685441705398632467122758546893330069e-3;
//  const double w6     =  1.3881912931358424526285652289974115047170651985345e-3;
//  const double w7     =  1.9983735415194021112767942931416179152416729204150e-4;
//  const double w8     =  2.3068467290270483679711135625155862511780587976925e-5;
//  const double w9     =  3.8865682386514872192656192137071689334005518164704e-6;




  //v4df_t a03_0, a03_1, a03_2, a03_3;
  //v4df_t a47_0, a47_1, a47_2, a47_3;
  //v4df_t p03_0, p03_1, p03_2, p03_3;
  //v4df_t p47_0, p47_1, p47_2, p47_3;
  //v4df_t y, l2e, tmp, p;
  //v4li_t k03_0, k03_1, k03_2, k03_3;
  //v4li_t k47_0, k47_1, k47_2, k47_3;
  //v4li_t offset;
  //v4li_t k1, k2;
  //__m128d p1, p2;









  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );


  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();


  // Load a03
  a03.v = _mm256_load_pd(      (double*)a         );
  // Load a47
  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v  = _mm256_load_pd(      (double*)b         );

  for ( i = 0; i < k_iter; ++i ) {
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );

    // Preload A03
    A03.v = _mm256_load_pd(      (double*)( a + 8 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Preload A47
    A47.v = _mm256_load_pd(      (double*)( a + 12 ) );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    // Preload B0
    B0.v  = _mm256_load_pd(      (double*)( b + 4 ) );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );


    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );

    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd(      (double*)( a + 16 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , B0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );

    b1.v  = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , B0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );

    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd(      (double*)( a + 20 ) );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( A47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Load b0 ( next iteration )
    b0.v  = _mm256_load_pd(      (double*)( b + 8 ) );

    c_tmp.v = _mm256_mul_pd( A03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd(      (double*)a         );
    //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] );

    a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
    //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] );

    b0.v  = _mm256_load_pd(      (double*)b         );
    //printf( "b0  = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] );

    c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );

    // Shuffle b ( 1, 0, 3, 2 )
    b1.v  = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );

    // Permute b ( 3, 2, 1, 0 )
    b2.v  = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );

    // Shuffle b ( 3, 2, 1, 0 )
    b3.v  = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );

    c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }
 

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );


  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );

  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );

  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );

  c03_0.v    = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v    = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );

  c03_1.v    = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v    = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );

  c47_0.v    = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v    = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );

  c47_1.v    = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v    = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  //printf( "rank-k\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );



  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd(      (double*)a         );
  //  a47.v = _mm256_load_pd(      (double*)( a + 4 ) );
  //  b0.v  = _mm256_broadcast_sd( (double*)b         );
  //  b1.v  = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v  = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v  = _mm256_broadcast_sd( (double*)( b + 3 ) );

  //  a += DKS_MR;
  //  b += DKS_NR;

  //  c_tmp.v = _mm256_mul_pd( a03.v  , b0.v    );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b1.v    );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b2.v    );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v  , b3.v    );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );

  //  c_tmp.v = _mm256_mul_pd( a47.v  , b0.v    );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b1.v    );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b2.v    );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v  , b3.v    );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}
  
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  //c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  //c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  //c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  //c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  //c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  //c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  //c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  //c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  //
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  //printf( "scale -2 \n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v  = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_add_pd( aa_tmp.v, c03_3.v );

  //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] );
  //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] );

  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v  = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_add_pd( aa_tmp.v, c47_3.v );
  

  //printf( "add a^2\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );


  // Prefetch u
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );


  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v  = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v  = _mm256_add_pd( bb_tmp.v, c47_0.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v  = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v  = _mm256_add_pd( bb_tmp.v, c47_1.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v  = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v  = _mm256_add_pd( bb_tmp.v, c47_2.v );

  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v  = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v  = _mm256_add_pd( bb_tmp.v, c47_3.v );



  // Check if there is any illegle value 
  c_tmp.v  = _mm256_broadcast_sd( &dzero );
  c03_0.v  = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v  = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v  = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v  = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v  = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v  = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v  = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v  = _mm256_max_pd( c_tmp.v, c47_3.v );



  // Scale before the kernel evaluation
  aa_tmp.v = _mm256_broadcast_sd( &alpha );
  c03_0.v  = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v  = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v  = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v  = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v  = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v  = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v  = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v  = _mm256_mul_pd( aa_tmp.v, c47_3.v );


  // Preload u03, u47
  u03.v    = _mm256_load_pd( (double*)u );
  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  // Prefetch w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );


  #include "ks_exp_int_d8x4.h"

  //printf( "square distance\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error Nan: c03_0[ %d ]\n", i );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error Nan: c03_1[ %d ]\n", i );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error Nan: c03_2[ %d ]\n", i );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error Nan: c03_3[ %d ]\n", i );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error Nan: c47_0[ %d ]\n", i );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error Nan: c47_1[ %d ]\n", i );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error Nan: c47_2[ %d ]\n", i );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error Nan: c47_3[ %d ]\n", i );
  //  }
  //}



//  tmp.v     = _mm256_broadcast_sd( &maxlog );
//  c03_0.v   = _mm256_min_pd( tmp.v, c03_0.v ); 
//  c03_1.v   = _mm256_min_pd( tmp.v, c03_1.v ); 
//  c03_2.v   = _mm256_min_pd( tmp.v, c03_2.v ); 
//  c03_3.v   = _mm256_min_pd( tmp.v, c03_3.v ); 
//  c47_0.v   = _mm256_min_pd( tmp.v, c47_0.v ); 
//  c47_1.v   = _mm256_min_pd( tmp.v, c47_1.v ); 
//  c47_2.v   = _mm256_min_pd( tmp.v, c47_2.v ); 
//  c47_3.v   = _mm256_min_pd( tmp.v, c47_3.v ); 
//  tmp.v     = _mm256_broadcast_sd( &minlog );
//  c03_0.v   = _mm256_max_pd( tmp.v, c03_0.v ); 
//  c03_1.v   = _mm256_max_pd( tmp.v, c03_1.v ); 
//  c03_2.v   = _mm256_max_pd( tmp.v, c03_2.v ); 
//  c03_3.v   = _mm256_max_pd( tmp.v, c03_3.v ); 
//  c47_0.v   = _mm256_max_pd( tmp.v, c47_0.v ); 
//  c47_1.v   = _mm256_max_pd( tmp.v, c47_1.v ); 
//  c47_2.v   = _mm256_max_pd( tmp.v, c47_2.v ); 
//  c47_3.v   = _mm256_max_pd( tmp.v, c47_3.v ); 
//
//  // a = c / log2e
//  // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] )
//  l2e.v         = _mm256_broadcast_sd( &log2e );
//  a03_0.v       = _mm256_mul_pd( l2e.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( l2e.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( l2e.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( l2e.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( l2e.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( l2e.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( l2e.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( l2e.v, c47_3.v );
//
//  // Check if a < 0 
//  tmp.v         = _mm256_setzero_pd();
//  p03_0.v       = _mm256_cmp_pd( a03_0.v, tmp.v, 1 );
//  p03_1.v       = _mm256_cmp_pd( a03_1.v, tmp.v, 1 );
//  p03_2.v       = _mm256_cmp_pd( a03_2.v, tmp.v, 1 );
//  p03_3.v       = _mm256_cmp_pd( a03_3.v, tmp.v, 1 );
//  p47_0.v       = _mm256_cmp_pd( a47_0.v, tmp.v, 1 );
//  p47_1.v       = _mm256_cmp_pd( a47_1.v, tmp.v, 1 );
//  p47_2.v       = _mm256_cmp_pd( a47_2.v, tmp.v, 1 );
//  p47_3.v       = _mm256_cmp_pd( a47_3.v, tmp.v, 1 );
//  tmp.v         = _mm256_broadcast_sd( &one );
//  p03_0.v       = _mm256_and_pd( tmp.v, p03_0.v );
//  p03_1.v       = _mm256_and_pd( tmp.v, p03_1.v );
//  p03_2.v       = _mm256_and_pd( tmp.v, p03_2.v );
//  p03_3.v       = _mm256_and_pd( tmp.v, p03_3.v );
//  p47_0.v       = _mm256_and_pd( tmp.v, p47_0.v );
//  p47_1.v       = _mm256_and_pd( tmp.v, p47_1.v );
//  p47_2.v       = _mm256_and_pd( tmp.v, p47_2.v );
//  p47_3.v       = _mm256_and_pd( tmp.v, p47_3.v );
//  // If a < 0 ( w < 0 ), then a - 1 =  ( k - 1 ) + w / ln2 
//  a03_0.v       = _mm256_sub_pd( a03_0.v, p03_0.v );
//  a03_1.v       = _mm256_sub_pd( a03_1.v, p03_1.v );
//  a03_2.v       = _mm256_sub_pd( a03_2.v, p03_2.v );
//  a03_3.v       = _mm256_sub_pd( a03_3.v, p03_3.v );
//  a47_0.v       = _mm256_sub_pd( a47_0.v, p47_0.v );
//  a47_1.v       = _mm256_sub_pd( a47_1.v, p47_1.v );
//  a47_2.v       = _mm256_sub_pd( a47_2.v, p47_2.v );
//  a47_3.v       = _mm256_sub_pd( a47_3.v, p47_3.v );
//  // Compute floor( a ) by two conversions
//  // if a < 0, p = k - 1
//  // else    , p = k
//  k03_0.v       = _mm256_cvttpd_epi32( a03_0.v );
//  k03_1.v       = _mm256_cvttpd_epi32( a03_1.v );
//  k03_2.v       = _mm256_cvttpd_epi32( a03_2.v );
//  k03_3.v       = _mm256_cvttpd_epi32( a03_3.v );
//  k47_0.v       = _mm256_cvttpd_epi32( a47_0.v );
//  k47_1.v       = _mm256_cvttpd_epi32( a47_1.v );
//  k47_2.v       = _mm256_cvttpd_epi32( a47_2.v );
//  k47_3.v       = _mm256_cvttpd_epi32( a47_3.v );
//  p03_0.v       = _mm256_cvtepi32_pd( k03_0.v );
//  p03_1.v       = _mm256_cvtepi32_pd( k03_1.v );
//  p03_2.v       = _mm256_cvtepi32_pd( k03_2.v );
//  p03_3.v       = _mm256_cvtepi32_pd( k03_3.v );
//  p47_0.v       = _mm256_cvtepi32_pd( k47_0.v );
//  p47_1.v       = _mm256_cvtepi32_pd( k47_1.v );
//  p47_2.v       = _mm256_cvtepi32_pd( k47_2.v );
//  p47_3.v       = _mm256_cvtepi32_pd( k47_3.v );
//
//  // ---------------------
//  // x -= p * ln2
//  // ---------------------
//  // c1 = ln2
//  // if a < 0, a = ( k - 1 ) * ln2
//  // else    , a = k * ln2
//  // if a < 0, x -= ( k - 1 ) * ln2
//  // else    , x -= k * ln2
//  //
//  tmp.v         = _mm256_broadcast_sd( &c1 );
//  a03_0.v       = _mm256_mul_pd( tmp.v, p03_0.v );
//  a03_1.v       = _mm256_mul_pd( tmp.v, p03_1.v );
//  a03_2.v       = _mm256_mul_pd( tmp.v, p03_2.v );
//  a03_3.v       = _mm256_mul_pd( tmp.v, p03_3.v );
//  a47_0.v       = _mm256_mul_pd( tmp.v, p47_0.v );
//  a47_1.v       = _mm256_mul_pd( tmp.v, p47_1.v );
//  a47_2.v       = _mm256_mul_pd( tmp.v, p47_2.v );
//  a47_3.v       = _mm256_mul_pd( tmp.v, p47_3.v );
//  c03_0.v       = _mm256_sub_pd( c03_0.v, a03_0.v );
//  c03_1.v       = _mm256_sub_pd( c03_1.v, a03_1.v );
//  c03_2.v       = _mm256_sub_pd( c03_2.v, a03_2.v );
//  c03_3.v       = _mm256_sub_pd( c03_3.v, a03_3.v );
//  c47_0.v       = _mm256_sub_pd( c47_0.v, a47_0.v );
//  c47_1.v       = _mm256_sub_pd( c47_1.v, a47_1.v );
//  c47_2.v       = _mm256_sub_pd( c47_2.v, a47_2.v );
//  c47_3.v       = _mm256_sub_pd( c47_3.v, a47_3.v );
//  tmp.v         = _mm256_broadcast_sd( &c2 );
//  a03_0.v       = _mm256_mul_pd( tmp.v, p03_0.v );
//  a03_1.v       = _mm256_mul_pd( tmp.v, p03_1.v );
//  a03_2.v       = _mm256_mul_pd( tmp.v, p03_2.v );
//  a03_3.v       = _mm256_mul_pd( tmp.v, p03_3.v );
//  a47_0.v       = _mm256_mul_pd( tmp.v, p47_0.v );
//  a47_1.v       = _mm256_mul_pd( tmp.v, p47_1.v );
//  a47_2.v       = _mm256_mul_pd( tmp.v, p47_2.v );
//  a47_3.v       = _mm256_mul_pd( tmp.v, p47_3.v );
//  c03_0.v       = _mm256_sub_pd( c03_0.v, a03_0.v );
//  c03_1.v       = _mm256_sub_pd( c03_1.v, a03_1.v );
//  c03_2.v       = _mm256_sub_pd( c03_2.v, a03_2.v );
//  c03_3.v       = _mm256_sub_pd( c03_3.v, a03_3.v );
//  c47_0.v       = _mm256_sub_pd( c47_0.v, a47_0.v );
//  c47_1.v       = _mm256_sub_pd( c47_1.v, a47_1.v );
//  c47_2.v       = _mm256_sub_pd( c47_2.v, a47_2.v );
//  c47_3.v       = _mm256_sub_pd( c47_3.v, a47_3.v );
//
//
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
//  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
//  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );
//
//
//  // Prefetch u
//  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u ) );
//
//
//
//  // Compute e^x using polynomial approximation
//  // a = w10 + w11 * x
//  tmp.v         = _mm256_broadcast_sd( &w11 );
//  //tmp.v         = _mm256_broadcast_sd( &w9 );
//  a03_0.v       = _mm256_mul_pd( c03_0.v, tmp.v );
//  a03_1.v       = _mm256_mul_pd( c03_1.v, tmp.v );
//  a03_2.v       = _mm256_mul_pd( c03_2.v, tmp.v );
//  a03_3.v       = _mm256_mul_pd( c03_3.v, tmp.v );
//  a47_0.v       = _mm256_mul_pd( c47_0.v, tmp.v );
//  a47_1.v       = _mm256_mul_pd( c47_1.v, tmp.v );
//  a47_2.v       = _mm256_mul_pd( c47_2.v, tmp.v );
//  a47_3.v       = _mm256_mul_pd( c47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w10 );
//  //tmp.v         = _mm256_broadcast_sd( &w8 );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x
//  tmp.v         = _mm256_broadcast_sd( &w9 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w8 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w7 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w6 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w5 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w4 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // Prefetch w
//  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );
//  // Preload u03
//  u03.v    = _mm256_load_pd( (double*)u );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w3 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w2 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  tmp.v         = _mm256_broadcast_sd( &w1 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//  tmp.v         = _mm256_broadcast_sd( &w0 );
//  a03_0.v       = _mm256_mul_pd( a03_0.v, c03_0.v );
//  a03_1.v       = _mm256_mul_pd( a03_1.v, c03_1.v );
//  a03_2.v       = _mm256_mul_pd( a03_2.v, c03_2.v );
//  a03_3.v       = _mm256_mul_pd( a03_3.v, c03_3.v );
//  a47_0.v       = _mm256_mul_pd( a47_0.v, c47_0.v );
//  a47_1.v       = _mm256_mul_pd( a47_1.v, c47_1.v );
//  a47_2.v       = _mm256_mul_pd( a47_2.v, c47_2.v );
//  a47_3.v       = _mm256_mul_pd( a47_3.v, c47_3.v );
//  a03_0.v       = _mm256_add_pd( a03_0.v, tmp.v );
//  a03_1.v       = _mm256_add_pd( a03_1.v, tmp.v );
//  a03_2.v       = _mm256_add_pd( a03_2.v, tmp.v );
//  a03_3.v       = _mm256_add_pd( a03_3.v, tmp.v );
//  a47_0.v       = _mm256_add_pd( a47_0.v, tmp.v );
//  a47_1.v       = _mm256_add_pd( a47_1.v, tmp.v );
//  a47_2.v       = _mm256_add_pd( a47_2.v, tmp.v );
//  a47_3.v       = _mm256_add_pd( a47_3.v, tmp.v );
//
//
//  // Preload u47
//  u47.v    = _mm256_load_pd( (double*)( u + 4 ) );
//
//
//  offset.v      = _mm_setr_epi32( 1023, 1023, 0, 0 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_0.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_1.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_2.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p03_3.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_0.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_1.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_2.v       = _mm256_set_m128d( p2, p1 );
//  k1.v          = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]);
//  k2.v          = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]);
//  k1.v          = _mm_add_epi32( k1.v, offset.v );
//  k2.v          = _mm_add_epi32( k2.v, offset.v );
//  k1.v          = _mm_slli_epi32( k1.v, 20 );
//  k2.v          = _mm_slli_epi32( k2.v, 20 );
//  k1.v          = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  k2.v          = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
//  p1            = _mm_castsi128_pd( k1.v );
//  p2            = _mm_castsi128_pd( k2.v );
//  p47_3.v       = _mm256_set_m128d( p2, p1 );
//  
// 
//  //u03.v    = _mm256_load_pd( (double*)u );
//  //u47.v    = _mm256_load_pd( (double*)( u + 4 ) );
//
//
//  c03_0.v       = _mm256_mul_pd( a03_0.v, p03_0.v );
//  c03_1.v       = _mm256_mul_pd( a03_1.v, p03_1.v );
//  c03_2.v       = _mm256_mul_pd( a03_2.v, p03_2.v );
//  c03_3.v       = _mm256_mul_pd( a03_3.v, p03_3.v );
//  c47_0.v       = _mm256_mul_pd( a47_0.v, p47_0.v );
//  c47_1.v       = _mm256_mul_pd( a47_1.v, p47_1.v );
//  c47_2.v       = _mm256_mul_pd( a47_2.v, p47_2.v );
//  c47_3.v       = _mm256_mul_pd( a47_3.v, p47_3.v );



  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error exp Nan: c03_0[ %d ]\n", i );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error exp Nan: c03_1[ %d ]\n", i );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error exp Nan: c03_2[ %d ]\n", i );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error exp Nan: c03_3[ %d ]\n", i );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error exp Nan: c47_0[ %d ]\n", i );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error exp Nan: c47_1[ %d ]\n", i );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error exp Nan: c47_2[ %d ]\n", i );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error exp Nan: c47_3[ %d ]\n", i );
  //  }
  //}




  //printf( "exp\n" );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
  //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );

  //printf( "w\n" );
  //printf( "%lf, %lf, %lf, %lf\n", w[0], w[3], w[3], w[3] );


  //u03.v    = _mm256_load_pd( (double*)u );
  //u47.v    = _mm256_load_pd( (double*)( u + 4 ) );

  w_tmp.v  = _mm256_broadcast_sd( (double*)w );
  c03_0.v  = _mm256_mul_pd( w_tmp.v, c03_0.v );
  c47_0.v  = _mm256_mul_pd( w_tmp.v, c47_0.v );
  u03.v    = _mm256_add_pd( u03.v, c03_0.v );
  u47.v    = _mm256_add_pd( u47.v, c47_0.v );
 

  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}


  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 1 ) );
  c03_1.v  = _mm256_mul_pd( w_tmp.v, c03_1.v );
  c47_1.v  = _mm256_mul_pd( w_tmp.v, c47_1.v );
  u03.v    = _mm256_add_pd( u03.v, c03_1.v );
  u47.v    = _mm256_add_pd( u47.v, c47_1.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}

  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 2 ) );
  c03_2.v  = _mm256_mul_pd( w_tmp.v, c03_2.v );
  c47_2.v  = _mm256_mul_pd( w_tmp.v, c47_2.v );
  u03.v    = _mm256_add_pd( u03.v, c03_2.v );
  u47.v    = _mm256_add_pd( u47.v, c47_2.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}

  w_tmp.v  = _mm256_broadcast_sd( (double*)( w + 3 ) );
  c03_3.v  = _mm256_mul_pd( w_tmp.v, c03_3.v );
  c47_3.v  = _mm256_mul_pd( w_tmp.v, c47_3.v );
  u03.v    = _mm256_add_pd( u03.v, c03_3.v );
  u47.v    = _mm256_add_pd( u47.v, c47_3.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) {
  //    printf( "error w_tmp Nan: w_tmp[ %d ]\n", i );
  //  }
  //}



  _mm256_store_pd( (double*)u, u03.v );
  _mm256_store_pd( (double*)( u + 4 ), u47.v );


  //for ( i = 0; i < 4; i++ ) {
  //  if ( c03_0.d[ i ] != c03_0.d[ i ] ) {
  //    printf( "error gemv Nan: c03_0[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_1.d[ i ] != c03_1.d[ i ] ) {
  //    printf( "error gemv Nan: c03_1[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_2.d[ i ] != c03_2.d[ i ] ) {
  //    printf( "error gemv Nan: c03_2[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c03_3.d[ i ] != c03_3.d[ i ] ) {
  //    printf( "error gemv Nan: c03_3[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_0.d[ i ] != c47_0.d[ i ] ) {
  //    printf( "error gemv Nan: c47_0[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_1.d[ i ] != c47_1.d[ i ] ) {
  //    printf( "error gemv Nan: c47_1[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_2.d[ i ] != c47_2.d[ i ] ) {
  //    printf( "error gemv Nan: c47_2[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //  if ( c47_3.d[ i ] != c47_3.d[ i ] ) {
  //    printf( "error gemv Nan: c47_3[ %d ]\n", i );
  //    exit( 1 );
  //  }
  //}


  //for ( i = 0; i < 4; i ++ ) {
  //  if ( w[ i ] != w[ i ] ) {
  //    printf( "GSKS error w Nan: w03[ %d ]\n", i );
  //  }
  //}


  //for ( i = 0; i < 4; i++ ) {
  //  if ( u03.d[ i ] != u03.d[ i ] ) {
  //    printf( "GSKS error u Nan: u03[ %d ]\n", i );
  //  }
  //  if ( u47.d[ i ] != u47.d[ i ] ) {
  //    printf( "GSKS error u Nan: u47[ %d ]\n", i );
  //  }
  //}



  //printf( "%lf\n", u03.d[0] );
  //printf( "%lf\n", u03.d[1] );
  //printf( "%lf\n", u03.d[2] );
  //printf( "%lf\n", u03.d[3] );
  //printf( "%lf\n", u47.d[0] );
  //printf( "%lf\n", u47.d[1] );
  //printf( "%lf\n", u47.d[2] );
  //printf( "%lf\n", u47.d[3] );
}