boost::optional<double> SimpleClean::FindPeakAVX(
    const double *image, size_t width, size_t height,
    size_t& x, size_t& y,
    size_t startY, size_t endY,
    size_t horizontalBorder, size_t verticalBorder)
{
  // lowest() rather than min(): min() is the smallest *positive* double and
  // would never be beaten when the image holds only negative values.
  double peakMax = std::numeric_limits<double>::lowest();
  size_t peakIndex = 0;
  __m256d mPeakMax = _mm256_set1_pd(peakMax);

  size_t xiStart = horizontalBorder, xiEnd = width - horizontalBorder;
  size_t yiStart = std::max(startY, verticalBorder),
         yiEnd = std::min(endY, height - verticalBorder);
  if(xiEnd < xiStart) xiEnd = xiStart;
  if(yiEnd < yiStart) yiEnd = yiStart;

  for(size_t yi = yiStart; yi != yiEnd; ++yi)
  {
    size_t index = yi*width + xiStart;
    const double* const endPtr = image + yi*width + xiEnd - 4;
    const double *i = image + index;
    // Vectorized scan: four pixels per iteration.
    for(; i < endPtr; i += 4)
    {
      __m256d val = _mm256_loadu_pd(i);
      if(AllowNegativeComponent) {
        __m256d negVal = _mm256_sub_pd(_mm256_set1_pd(0.0), val);
        val = _mm256_max_pd(val, negVal);
      }
      int mask = _mm256_movemask_pd(_mm256_cmp_pd(val, mPeakMax, _CMP_GT_OQ));
      if(mask != 0)
      {
        // At least one of the four lanes beats the current peak: rescan scalar.
        for(size_t di = 0; di != 4; ++di)
        {
          double value = i[di];
          if(AllowNegativeComponent) value = std::fabs(value);
          if(value > peakMax)
          {
            peakIndex = index + di;
            peakMax = value;
            mPeakMax = _mm256_set1_pd(peakMax);
          }
        }
      }
      index += 4;
    }
    // Scalar tail: the last (up to four) pixels of the row.
    for(; i != endPtr + 4; ++i)
    {
      double value = *i;
      if(AllowNegativeComponent) value = std::fabs(value);
      if(value > peakMax)
      {
        peakIndex = index;
        peakMax = value;
      }
      ++index;
    }
  }
  x = peakIndex % width;
  y = peakIndex / width;
  return image[x + y*width];
}
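// The AllowNegativeComponent path above forms |v| as max(v, 0 - v). Below is a
// minimal self-contained sketch of that idiom next to the other common one
// (masking off the sign bit); both helper names are ours, for illustration only.
#include <immintrin.h>

// |v| the way the kernel above computes it: lane-wise max of v and -v.
static inline __m256d abs_via_max(__m256d v) {
    return _mm256_max_pd(v, _mm256_sub_pd(_mm256_set1_pd(0.0), v));
}

// The equivalent sign-bit trick: clear bit 63 of every lane.
static inline __m256d abs_via_andnot(__m256d v) {
    return _mm256_andnot_pd(_mm256_set1_pd(-0.0), v);
}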
__SIMDd _SIMD_max_pd(__SIMDd a, __SIMDd b) {
#ifdef USE_SSE
  return _mm_max_pd(a, b);
#elif defined USE_AVX
  return _mm256_max_pd(a, b);
#elif defined USE_IBM
  return vec_max(a, b);
#endif
}
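// A self-contained check (ours, not part of the wrapper's source tree) of what
// the AVX branch computes: _mm256_max_pd takes the lane-wise maximum of four
// doubles. Compile with -mavx.
#include <immintrin.h>
#include <cstdio>

int main() {
    __m256d a = _mm256_setr_pd(1.0, -2.0, 3.0, -4.0);
    __m256d b = _mm256_setr_pd(0.5, -1.0, 4.0, -8.0);
    double r[4];
    _mm256_storeu_pd(r, _mm256_max_pd(a, b));
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 1 -1 4 -4
    return 0;
}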
/*!
 * \brief Compute the element-wise maximum of the two given vectors
 */
ETL_STATIC_INLINE(avx_simd_double) max(avx_simd_double lhs, avx_simd_double rhs) {
    return _mm256_max_pd(lhs.value, rhs.value);
}
// Process audio effects for 8 channels simultaneously:
void processEffects(const vec8_i32 &inpSamples, vec8_i32 &outSamples, const long n)
{
    // Extract int samples and convert to doubles:
    const vec4_d64 ds0 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 0)),
        _mm256_set1_pd((double)INT_MAX));
    const vec4_d64 ds1 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 1)),
        _mm256_set1_pd((double)INT_MAX));

    // Monitor input levels:
    fx.fi_monitor.levels[n + 0] = scalar_to_dBFS(ds0);
    fx.fi_monitor.levels[n + 1] = scalar_to_dBFS(ds1);

    vec4_d64 s0, s1;

    // f0_gain:
    {
        s0 = _mm256_mul_pd(ds0, fx.f0_gain.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(ds1, fx.f0_gain.calc.gain[n + 1]);
    }

    // Monitor levels:
    fx.f0_output.levels[n + 0] = scalar_to_dBFS(s0);
    fx.f0_output.levels[n + 1] = scalar_to_dBFS(s1);

    // f1_compressor:
    {
        const vec4_dBFS l0 = scalar_to_dBFS_offs(s0);
        const vec4_dBFS l1 = scalar_to_dBFS_offs(s1);

        // over = s - thresh
        vec4_dB over0 = _mm256_sub_pd(l0, fx.f1_compressor.input.threshold[n + 0]);
        vec4_dB over1 = _mm256_sub_pd(l1, fx.f1_compressor.input.threshold[n + 1]);

        // over = if over < 0.0 then 0.0 else over
        over0 = mm256_if_then_else(_mm256_cmp_pd(over0, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over0);
        over1 = mm256_if_then_else(_mm256_cmp_pd(over1, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over1);

        // over += DC_OFFSET
        over0 = _mm256_add_pd(over0, DC_OFFSET);
        over1 = _mm256_add_pd(over1, DC_OFFSET);

        // env = over + coef * ( env - over )
        const vec4_dB attack_env0 = _mm256_add_pd(over0,
            _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 0],
                          _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB attack_env1 = _mm256_add_pd(over1,
            _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 1],
                          _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));
        const vec4_dB release_env0 = _mm256_add_pd(over0,
            _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 0],
                          _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB release_env1 = _mm256_add_pd(over1,
            _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 1],
                          _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));

        // env = if over > env then attack_env else release_env
        fx.f1_compressor.state.env[n + 0] = mm256_if_then_else(
            _mm256_cmp_pd(over0, fx.f1_compressor.state.env[n + 0], _CMP_GT_OQ),
            attack_env0, release_env0);
        fx.f1_compressor.state.env[n + 1] = mm256_if_then_else(
            _mm256_cmp_pd(over1, fx.f1_compressor.state.env[n + 1], _CMP_GT_OQ),
            attack_env1, release_env1);

        // over = env - DC_OFFSET
        over0 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], DC_OFFSET);
        over1 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], DC_OFFSET);

        // grdB = over * ( ratio - 1.0 )
        vec4_dB gr0dB = _mm256_mul_pd(over0, fx.f1_compressor.calc.ratio_min_1[n + 0]);
        vec4_dB gr1dB = _mm256_mul_pd(over1, fx.f1_compressor.calc.ratio_min_1[n + 1]);

        // gr = dB_to_scalar(grdB)
        fx.f1_compressor.monitor.gain_reduction[n + 0] = dB_to_scalar(gr0dB);
        fx.f1_compressor.monitor.gain_reduction[n + 1] = dB_to_scalar(gr1dB);

        // Apply gain reduction to inputs:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.monitor.gain_reduction[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.monitor.gain_reduction[n + 1]);

        // Apply make-up gain:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.calc.gain[n + 1]);
    }

    // Monitor output levels:
    fx.fo_monitor.levels[n + 0] = scalar_to_dBFS(s0);
    fx.fo_monitor.levels[n + 1] = scalar_to_dBFS(s1);

    // TODO(jsd): Better limiter implementation!
    // Limit final samples:
    s0 = _mm256_max_pd(_mm256_min_pd(s0, _mm256_set1_pd(1.0)), _mm256_set1_pd(-1.0));
    s1 = _mm256_max_pd(_mm256_min_pd(s1, _mm256_set1_pd(1.0)), _mm256_set1_pd(-1.0));

    // Convert doubles back to 32-bit ints:
    s0 = _mm256_mul_pd(s0, _mm256_set1_pd((double)INT_MAX));
    s1 = _mm256_mul_pd(s1, _mm256_set1_pd((double)INT_MAX));
    const vec8_i32 os = _mm256_setr_m128i(_mm256_cvtpd_epi32(s0), _mm256_cvtpd_epi32(s1));

    // Write outputs:
    _mm256_stream_si256(&outSamples, os);
}
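// mm256_if_then_else is used above but not defined in this snippet. A
// plausible minimal definition (our assumption, not the original source) is a
// blendv-based select driven by the all-ones/all-zeros mask that
// _mm256_cmp_pd produces:
#include <immintrin.h>

// Per-lane select: where mask is all-ones take then_val, else else_val.
static inline __m256d mm256_if_then_else(__m256d mask, __m256d then_val, __m256d else_val) {
    return _mm256_blendv_pd(else_val, then_val, mask);
}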
inline vector4d max(const vector4d& lhs, const vector4d& rhs) { return _mm256_max_pd(lhs, rhs); }
inline F64vec4 max(const F64vec4 &l, const F64vec4 &r) { return _mm256_max_pd(l, r); }
BI_FORCE_INLINE inline avx_double max(const avx_double x, const avx_double y) {
    avx_double res;
    res.packed = _mm256_max_pd(x.packed, y.packed);
    return res;
}
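// The typical consumer of these max() wrappers is a branch-free clamp, the
// same min/max sandwich the audio limiter above uses. A small sketch (the
// helper name is ours):
#include <immintrin.h>

static inline __m256d clamp_pd(__m256d x, __m256d lo, __m256d hi) {
    return _mm256_min_pd(_mm256_max_pd(x, lo), hi);
}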
void ks_multiquadratic_int_d8x4(
    int    k,
    int    rhs,
    double *u,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *w,
    double *c,
    ks_t   *ker,
    aux_t  *aux
    )
{
  int    i, rhs_left;
  double neg2  = -2.0;
  double dzero =  0.0;
  double done  =  1.0;
  double mdone = -1.0;
  double alpha = ( 3.0 / 4.0 );
  double cons  = ker->cons;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t u03, u47;
  v4df_t a03, a47, A03, A47; // prefetched A
  v4df_t b0, b1, b2, b3, B0; // prefetched B
  v4df_t c_tmp, aa_tmp, bb_tmp, w_tmp;

  // Rank-k update segment
  #include "ks_rank_k_int_d8x4.h"

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );

  // Accumulate
  if ( aux->pc ) {
    tmpc03_0.v = _mm256_load_pd( (double*)( c      ) );
    c03_0.v    = _mm256_add_pd( tmpc03_0.v, c03_0.v );
    tmpc47_0.v = _mm256_load_pd( (double*)( c +  4 ) );
    c47_0.v    = _mm256_add_pd( tmpc47_0.v, c47_0.v );
    tmpc03_1.v = _mm256_load_pd( (double*)( c +  8 ) );
    c03_1.v    = _mm256_add_pd( tmpc03_1.v, c03_1.v );
    tmpc47_1.v = _mm256_load_pd( (double*)( c + 12 ) );
    c47_1.v    = _mm256_add_pd( tmpc47_1.v, c47_1.v );
    tmpc03_2.v = _mm256_load_pd( (double*)( c + 16 ) );
    c03_2.v    = _mm256_add_pd( tmpc03_2.v, c03_2.v );
    tmpc47_2.v = _mm256_load_pd( (double*)( c + 20 ) );
    c47_2.v    = _mm256_add_pd( tmpc47_2.v, c47_2.v );
    tmpc03_3.v = _mm256_load_pd( (double*)( c + 24 ) );
    c03_3.v    = _mm256_add_pd( tmpc03_3.v, c03_3.v );
    tmpc47_3.v = _mm256_load_pd( (double*)( c + 28 ) );
    c47_3.v    = _mm256_add_pd( tmpc47_3.v, c47_3.v );
  }

  // Scale by -2
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v );

  // Add a^2
  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v );
  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v );

  // Add b^2
  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v );

  // Check if there is any illegal value ( clamp negatives to zero )
  c_tmp.v = _mm256_broadcast_sd( &dzero );
  c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v );

  // Preload u03, u47
  u03.v = _mm256_load_pd( (double*)u );
  u47.v = _mm256_load_pd( (double*)( u + 4 ) );

  // Prefetch u and w
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( u + 8 ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( w ) );

  // c = c + cons
  c_tmp.v = _mm256_broadcast_sd( &cons );
  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

  // Multiple rhs kernel summation.
  #include "ks_kernel_summation_int_d8x4.h"
}
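// The "illegal value" clamp in the kernels above guards the squared-distance
// expansion d2 = -2*a.b + ||a||^2 + ||b||^2, which floating-point cancellation
// can push slightly below zero. A minimal sketch of that step (names ours):
#include <immintrin.h>

static inline __m256d clamp_sqdist(__m256d neg2ab, __m256d aa, __m256d bb) {
    __m256d d2 = _mm256_add_pd(_mm256_add_pd(neg2ab, aa), bb);
    return _mm256_max_pd(_mm256_setzero_pd(), d2); // kill negative round-off
}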
void rnn_int_d8x4_var2(
    int    k,
    double *aa,
    double *a,
    double *bb,
    double *b,
    double *c,
    aux_t  *aux
    )
{
  int    i;
  double neg2  = -2.0;
  double dzero =  0.0;
  v4df_t c03_0, c03_1, c03_2, c03_3;
  v4df_t c47_0, c47_1, c47_2, c47_3;
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A
  v4df_t b0, b1, b2, b3;
  v4df_t B0;       // prefetched B
  v4df_t aa_tmp, bb_tmp;

  int k_iter = k / 2;
  int k_left = k % 2;

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0)    \n\t" : :"r"( aux->b_next ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( c ) );

  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();

  // Load a03, a47 and ( b0, b1, b2, b3 )
  a03.v = _mm256_load_pd( (double*)a );
  a47.v = _mm256_load_pd( (double*)( a + 4 ) );
  b0.v  = _mm256_load_pd( (double*)b );

  // Main loop, unrolled by two.
  for ( i = 0; i < k_iter; ++i ) {
    // Iteration #0
    __asm__ volatile( "prefetcht0 192(%0)    \n\t" : :"r"(a) );
    // Preload A03
    A03.v = _mm256_load_pd( (double*)( a + 8 ) );
    c_tmp.v = _mm256_mul_pd( a03.v, b0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    // Preload A47
    A47.v = _mm256_load_pd( (double*)( a + 12 ) );
    // Shuffle b ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v, b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );
    // Preload B0
    B0.v = _mm256_load_pd( (double*)( b + 4 ) );
    c_tmp.v = _mm256_mul_pd( a03.v, b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
    // Shuffle b ( 2, 3, 0, 1 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v, b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0)    \n\t" : :"r"(a) );
    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd( (double*)( a + 16 ) );
    c_tmp.v = _mm256_mul_pd( A03.v, B0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( A47.v, B0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v, b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd( (double*)( a + 20 ) );
    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );
    c_tmp.v = _mm256_mul_pd( A47.v, b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v, b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    // Shuffle b ( 2, 3, 0, 1 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( A47.v, b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
    // Load b0 ( next iteration )
    b0.v = _mm256_load_pd( (double*)( b + 8 ) );
    c_tmp.v = _mm256_mul_pd( A03.v, b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v, b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 16;
    b += 8;
  }

  // Remainder iteration.
  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd( (double*)a );
    a47.v = _mm256_load_pd( (double*)( a + 4 ) );
    b0.v  = _mm256_load_pd( (double*)b );
    c_tmp.v = _mm256_mul_pd( a03.v, b0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    // Shuffle b ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v, b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );
    c_tmp.v = _mm256_mul_pd( a03.v, b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
    // Shuffle b ( 2, 3, 0, 1 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v, b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v, b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );

    a += 8;
    b += 4;
  }

  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( bb ) );

  // Sort the rotated accumulators back into columns.
  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );
  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );
  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );
  c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );
  c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );
  c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );
  c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );

  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->I ) );
  __asm__ volatile( "prefetcht0 0(%0)    \n\t" : :"r"( aux->D ) );

  // Reference ( broadcast-based, un-unrolled ) rank-k update:
  //for ( i = 0; i < k; i++ ) {
  //  a03.v = _mm256_load_pd( (double*)a );
  //  a47.v = _mm256_load_pd( (double*)( a + 4 ) );
  //  b0.v = _mm256_broadcast_sd( (double*)b );
  //  b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) );
  //  b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) );
  //  b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) );
  //  a += DKS_MR;
  //  b += DKS_NR;
  //  c_tmp.v = _mm256_mul_pd( a03.v, b0.v );
  //  c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v, b1.v );
  //  c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v, b2.v );
  //  c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
  //  c_tmp.v = _mm256_mul_pd( a03.v, b3.v );
  //  c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v, b0.v );
  //  c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v, b1.v );
  //  c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v, b2.v );
  //  c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
  //  c_tmp.v = _mm256_mul_pd( a47.v, b3.v );
  //  c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
  //}

  // Scale by -2
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v );

  // Add a^2
  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v );
  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v );

  // Add b^2
  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v );

  // Check if there is any illegal value ( clamp negatives to zero )
  c_tmp.v = _mm256_broadcast_sd( &dzero );
  c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v );

  // Transpose c03/c47 _0, _1, _2, _3 to row vectors
  tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 );
  tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF );
  tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 );
  tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF );
  tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 );
  tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF );
  tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 );
  tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF );
  c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 );
  c03_2.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 );
  c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 );
  c03_3.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 );
  c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 );
  c47_2.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 );
  c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 );
  c47_3.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 );

  _mm256_store_pd( c     , c03_0.v );
  _mm256_store_pd( c +  4, c03_1.v );
  _mm256_store_pd( c +  8, c03_2.v );
  _mm256_store_pd( c + 12, c03_3.v );
  _mm256_store_pd( c + 16, c47_0.v );
  _mm256_store_pd( c + 20, c47_1.v );
  _mm256_store_pd( c + 24, c47_2.v );
  _mm256_store_pd( c + 28, c47_3.v );
}
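// The kernels rotate b through all four lane orders instead of broadcasting
// each element. A runnable demo (ours) of the rotation chain used above;
// compile with -mavx:
#include <immintrin.h>
#include <cstdio>

int main() {
    __m256d b0 = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
    __m256d b1 = _mm256_shuffle_pd(b0, b0, 0x5);       // (1,0,3,2): swap within each 128-bit lane
    __m256d b2 = _mm256_permute2f128_pd(b1, b1, 0x1);  // (3,2,1,0): swap the two lanes
    __m256d b3 = _mm256_shuffle_pd(b2, b2, 0x5);       // (2,3,0,1)
    double r[4];
    _mm256_storeu_pd(r, b3);
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);   // prints: 2 3 0 1
    return 0;
}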
void ks_gaussian_int_d8x4( int k, double alpha, double *u, double *aa, double *a, double *bb, double *b, double *w, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t u03; v4df_t u47; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; v4df_t w_tmp; //// Inline vdExp() //const double log2e = 1.4426950408889634073599; //const double maxlog = 7.09782712893383996843e2; // log( 2**1024 ) //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 ) //const double one = 1.0; //const double c1 = 6.93145751953125E-1; //const double c2 = 1.42860682030941723212E-6; //// Original Remez Order 11 coefficients //const double w11 = 3.5524625185478232665958141148891055719216674475023e-8; //const double w10 = 2.5535368519306500343384723775435166753084614063349e-7; //const double w9 = 2.77750562801295315877005242757916081614772210463065e-6; //const double w8 = 2.47868893393199945541176652007657202642495832996107e-5; //const double w7 = 1.98419213985637881240770890090795533564573406893163e-4; //const double w6 = 1.3888869684178659239014256260881685824525255547326e-3; //const double w5 = 8.3333337052009872221152811550156335074160546333973e-3; //const double w4 = 4.1666666621080810610346717440523105184720007971655e-2; //const double w3 = 0.166666666669960803484477734308515404418108830469798; //const double w2 = 0.499999999999877094481580370323249951329122224389189; //const double w1 = 1.0000000000000017952745258419615282194236357388884; //const double w0 = 0.99999999999999999566016490920259318691496540598896; // Remez Order 11 polynomail approximation //const double w0 = 9.9999999999999999694541216787022234814339814028865e-1; //const double w1 = 1.0000000000000013347525109964212249781265243645457; //const double w2 = 4.9999999999990426011279542064313207349934058355357e-1; //const double w3 = 1.6666666666933781279020916199156875162816850273886e-1; //const double w4 = 4.1666666628388978913396218847247771982698350546174e-2; //const double w5 = 8.3333336552944126722390410619859929515740995889372e-3; //const double w6 = 1.3888871805082296012945081624687544823497126781709e-3; //const double w7 = 1.9841863599469418342286677256362193951266072398489e-4; //const double w8 = 2.4787899938611697691690479138150629377630767114546e-5; //const double w9 = 2.7764095757136528235740765949934667970688427190168e-6; //const double w10 = 2.5602485412126369546033948405199058329040797134573e-7; //const double w11 = 3.5347283721656121939634391175390704621351283546671e-8; // Remez Order 9 polynomail approximation // const double w0 = 9.9999999999998657717890998293462356769270934668652e-1; // const double w1 = 1.0000000000041078023971691258305486059867172736079; // const double w2 = 4.9999999979496223000111361187419539211772440139043e-1; // const double w3 = 1.6666667059968250851708016603646727895353772273675e-1; // const double w4 = 4.1666628655740875994884332519499013211594753124142e-2; // const double w5 = 8.3335428149736685441705398632467122758546893330069e-3; // const double w6 = 1.3881912931358424526285652289974115047170651985345e-3; // const double w7 = 1.9983735415194021112767942931416179152416729204150e-4; // const double w8 = 2.3068467290270483679711135625155862511780587976925e-5; // const double w9 = 3.8865682386514872192656192137071689334005518164704e-6; 
//v4df_t a03_0, a03_1, a03_2, a03_3; //v4df_t a47_0, a47_1, a47_2, a47_3; //v4df_t p03_0, p03_1, p03_2, p03_3; //v4df_t p47_0, p47_1, p47_2, p47_3; //v4df_t y, l2e, tmp, p; //v4li_t k03_0, k03_1, k03_2, k03_3; //v4li_t k47_0, k47_1, k47_2, k47_3; //v4li_t offset; //v4li_t k1, k2; //__m128d p1, p2; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = _mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] 
); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < k; i++ ) { // a03.v = _mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( 
c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // Prefetch u __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); bb_tmp.v = _mm256_broadcast_sd( (double*)bb ); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( 
bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Scale before the kernel evaluation aa_tmp.v = _mm256_broadcast_sd( &alpha ); c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // Preload u03, u47 u03.v = _mm256_load_pd( (double*)u ); u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // Prefetch w __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); #include "ks_exp_int_d8x4.h" //printf( "square distance\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error Nan: c47_3[ %d ]\n", i ); // } //} // tmp.v = _mm256_broadcast_sd( &maxlog ); // c03_0.v = _mm256_min_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_min_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_min_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_min_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_min_pd( tmp.v, c47_0.v ); // c47_1.v = _mm256_min_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_min_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_min_pd( tmp.v, c47_3.v ); // tmp.v = _mm256_broadcast_sd( &minlog ); // c03_0.v = _mm256_max_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_max_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_max_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_max_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_max_pd( tmp.v, c47_0.v ); // 
c47_1.v = _mm256_max_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_max_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_max_pd( tmp.v, c47_3.v ); // // // a = c / log2e // // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] ) // l2e.v = _mm256_broadcast_sd( &log2e ); // a03_0.v = _mm256_mul_pd( l2e.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( l2e.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( l2e.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( l2e.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( l2e.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( l2e.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( l2e.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( l2e.v, c47_3.v ); // // // Check if a < 0 // tmp.v = _mm256_setzero_pd(); // p03_0.v = _mm256_cmp_pd( a03_0.v, tmp.v, 1 ); // p03_1.v = _mm256_cmp_pd( a03_1.v, tmp.v, 1 ); // p03_2.v = _mm256_cmp_pd( a03_2.v, tmp.v, 1 ); // p03_3.v = _mm256_cmp_pd( a03_3.v, tmp.v, 1 ); // p47_0.v = _mm256_cmp_pd( a47_0.v, tmp.v, 1 ); // p47_1.v = _mm256_cmp_pd( a47_1.v, tmp.v, 1 ); // p47_2.v = _mm256_cmp_pd( a47_2.v, tmp.v, 1 ); // p47_3.v = _mm256_cmp_pd( a47_3.v, tmp.v, 1 ); // tmp.v = _mm256_broadcast_sd( &one ); // p03_0.v = _mm256_and_pd( tmp.v, p03_0.v ); // p03_1.v = _mm256_and_pd( tmp.v, p03_1.v ); // p03_2.v = _mm256_and_pd( tmp.v, p03_2.v ); // p03_3.v = _mm256_and_pd( tmp.v, p03_3.v ); // p47_0.v = _mm256_and_pd( tmp.v, p47_0.v ); // p47_1.v = _mm256_and_pd( tmp.v, p47_1.v ); // p47_2.v = _mm256_and_pd( tmp.v, p47_2.v ); // p47_3.v = _mm256_and_pd( tmp.v, p47_3.v ); // // If a < 0 ( w < 0 ), then a - 1 = ( k - 1 ) + w / ln2 // a03_0.v = _mm256_sub_pd( a03_0.v, p03_0.v ); // a03_1.v = _mm256_sub_pd( a03_1.v, p03_1.v ); // a03_2.v = _mm256_sub_pd( a03_2.v, p03_2.v ); // a03_3.v = _mm256_sub_pd( a03_3.v, p03_3.v ); // a47_0.v = _mm256_sub_pd( a47_0.v, p47_0.v ); // a47_1.v = _mm256_sub_pd( a47_1.v, p47_1.v ); // a47_2.v = _mm256_sub_pd( a47_2.v, p47_2.v ); // a47_3.v = _mm256_sub_pd( a47_3.v, p47_3.v ); // // Compute floor( a ) by two conversions // // if a < 0, p = k - 1 // // else , p = k // k03_0.v = _mm256_cvttpd_epi32( a03_0.v ); // k03_1.v = _mm256_cvttpd_epi32( a03_1.v ); // k03_2.v = _mm256_cvttpd_epi32( a03_2.v ); // k03_3.v = _mm256_cvttpd_epi32( a03_3.v ); // k47_0.v = _mm256_cvttpd_epi32( a47_0.v ); // k47_1.v = _mm256_cvttpd_epi32( a47_1.v ); // k47_2.v = _mm256_cvttpd_epi32( a47_2.v ); // k47_3.v = _mm256_cvttpd_epi32( a47_3.v ); // p03_0.v = _mm256_cvtepi32_pd( k03_0.v ); // p03_1.v = _mm256_cvtepi32_pd( k03_1.v ); // p03_2.v = _mm256_cvtepi32_pd( k03_2.v ); // p03_3.v = _mm256_cvtepi32_pd( k03_3.v ); // p47_0.v = _mm256_cvtepi32_pd( k47_0.v ); // p47_1.v = _mm256_cvtepi32_pd( k47_1.v ); // p47_2.v = _mm256_cvtepi32_pd( k47_2.v ); // p47_3.v = _mm256_cvtepi32_pd( k47_3.v ); // // // --------------------- // // x -= p * ln2 // // --------------------- // // c1 = ln2 // // if a < 0, a = ( k - 1 ) * ln2 // // else , a = k * ln2 // // if a < 0, x -= ( k - 1 ) * ln2 // // else , x -= k * ln2 // // // tmp.v = _mm256_broadcast_sd( &c1 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // 
c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // tmp.v = _mm256_broadcast_sd( &c2 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // // // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // // // // Prefetch u // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); // // // // // Compute e^x using polynomial approximation // // a = w10 + w11 * x // tmp.v = _mm256_broadcast_sd( &w11 ); // //tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( c03_0.v, tmp.v ); // a03_1.v = _mm256_mul_pd( c03_1.v, tmp.v ); // a03_2.v = _mm256_mul_pd( c03_2.v, tmp.v ); // a03_3.v = _mm256_mul_pd( c03_3.v, tmp.v ); // a47_0.v = _mm256_mul_pd( c47_0.v, tmp.v ); // a47_1.v = _mm256_mul_pd( c47_1.v, tmp.v ); // a47_2.v = _mm256_mul_pd( c47_2.v, tmp.v ); // a47_3.v = _mm256_mul_pd( c47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w10 ); // //tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x // tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // 
a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w7 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w6 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w5 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w4 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // 
a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Prefetch w // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); // // Preload u03 // u03.v = _mm256_load_pd( (double*)u ); // // // tmp.v = _mm256_broadcast_sd( &w3 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w2 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w1 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w0 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // 
a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Preload u47 // u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // offset.v = _mm_setr_epi32( 1023, 1023, 0, 0 ); // k1.v = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_2.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_3.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]); // k2.v = _mm_set_epi32( 
// k1.v = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]);
// k2.v = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]);
// k1.v = _mm_add_epi32( k1.v, offset.v );
// k2.v = _mm_add_epi32( k2.v, offset.v );
// k1.v = _mm_slli_epi32( k1.v, 20 );
// k2.v = _mm_slli_epi32( k2.v, 20 );
// k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
// k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
// p1 = _mm_castsi128_pd( k1.v );
// p2 = _mm_castsi128_pd( k2.v );
// p47_2.v = _mm256_set_m128d( p2, p1 );
//
// k1.v = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]);
// k2.v = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]);
// k1.v = _mm_add_epi32( k1.v, offset.v );
// k2.v = _mm_add_epi32( k2.v, offset.v );
// k1.v = _mm_slli_epi32( k1.v, 20 );
// k2.v = _mm_slli_epi32( k2.v, 20 );
// k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
// k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) );
// p1 = _mm_castsi128_pd( k1.v );
// p2 = _mm_castsi128_pd( k2.v );
// p47_3.v = _mm256_set_m128d( p2, p1 );
//
// //u03.v = _mm256_load_pd( (double*)u );
// //u47.v = _mm256_load_pd( (double*)( u + 4 ) );
//
// Combine the polynomial parts a03_*/a47_* with the 2^k factors ( c = a * p ):
// c03_0.v = _mm256_mul_pd( a03_0.v, p03_0.v );
// c03_1.v = _mm256_mul_pd( a03_1.v, p03_1.v );
// c03_2.v = _mm256_mul_pd( a03_2.v, p03_2.v );
// c03_3.v = _mm256_mul_pd( a03_3.v, p03_3.v );
// c47_0.v = _mm256_mul_pd( a47_0.v, p47_0.v );
// c47_1.v = _mm256_mul_pd( a47_1.v, p47_1.v );
// c47_2.v = _mm256_mul_pd( a47_2.v, p47_2.v );
// c47_3.v = _mm256_mul_pd( a47_3.v, p47_3.v );

// Debug checks: ( x != x ) is true only for NaN lanes in the exp results.
//for ( i = 0; i < 4; i++ ) {
//  if ( c03_0.d[ i ] != c03_0.d[ i ] ) { printf( "error exp Nan: c03_0[ %d ]\n", i ); }
//  if ( c03_1.d[ i ] != c03_1.d[ i ] ) { printf( "error exp Nan: c03_1[ %d ]\n", i ); }
//  if ( c03_2.d[ i ] != c03_2.d[ i ] ) { printf( "error exp Nan: c03_2[ %d ]\n", i ); }
//  if ( c03_3.d[ i ] != c03_3.d[ i ] ) { printf( "error exp Nan: c03_3[ %d ]\n", i ); }
//  if ( c47_0.d[ i ] != c47_0.d[ i ] ) { printf( "error exp Nan: c47_0[ %d ]\n", i ); }
//  if ( c47_1.d[ i ] != c47_1.d[ i ] ) { printf( "error exp Nan: c47_1[ %d ]\n", i ); }
//  if ( c47_2.d[ i ] != c47_2.d[ i ] ) { printf( "error exp Nan: c47_2[ %d ]\n", i ); }
//  if ( c47_3.d[ i ] != c47_3.d[ i ] ) { printf( "error exp Nan: c47_3[ %d ]\n", i ); }
//}
//printf( "exp\n" );
//printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] );
//printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] );
//printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] );
//printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] );
//printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] );
//printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] );
//printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] );
//printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] );
//printf( "w\n" );
//printf( "%lf, %lf, %lf, %lf\n", w[0], w[1], w[2], w[3] );

//u03.v = _mm256_load_pd( (double*)u );
//u47.v = _mm256_load_pd( (double*)( u + 4 ) );

// Accumulate u( 0:7 ) += w[ j ] * c_j( 0:7 ) for j = 0..3, fully unrolled:
w_tmp.v = _mm256_broadcast_sd( (double*)w );
c03_0.v = _mm256_mul_pd( w_tmp.v, c03_0.v );
c47_0.v = _mm256_mul_pd( w_tmp.v, c47_0.v );
u03.v = _mm256_add_pd( u03.v, c03_0.v );
u47.v = _mm256_add_pd( u47.v, c47_0.v );
//for ( i = 0; i < 4; i++ ) {
//  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); }
//}
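// The live accumulation around this point is u( 0:7 ) += w[ j ] * c_j( 0:7 ),
// with each scalar weight broadcast across all four lanes of a ymm register.
// A self-contained sketch of one such update follows; axpy8 is an illustrative
// name, and unaligned accesses are assumed here, unlike the aligned
// _mm256_load_pd / _mm256_store_pd in the kernel itself.
#include <immintrin.h>

static void axpy8( double *u, const double *c, const double *w, int j )
{
    __m256d w_tmp = _mm256_broadcast_sd( w + j );  /* splat w[ j ] */
    __m256d u03   = _mm256_loadu_pd( u );
    __m256d u47   = _mm256_loadu_pd( u + 4 );
    u03 = _mm256_add_pd( u03, _mm256_mul_pd( w_tmp, _mm256_loadu_pd( c ) ) );
    u47 = _mm256_add_pd( u47, _mm256_mul_pd( w_tmp, _mm256_loadu_pd( c + 4 ) ) );
    _mm256_storeu_pd( u,     u03 );
    _mm256_storeu_pd( u + 4, u47 );
}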
w_tmp.v = _mm256_broadcast_sd( (double*)( w + 1 ) );
c03_1.v = _mm256_mul_pd( w_tmp.v, c03_1.v );
c47_1.v = _mm256_mul_pd( w_tmp.v, c47_1.v );
u03.v = _mm256_add_pd( u03.v, c03_1.v );
u47.v = _mm256_add_pd( u47.v, c47_1.v );
//for ( i = 0; i < 4; i++ ) {
//  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); }
//}
w_tmp.v = _mm256_broadcast_sd( (double*)( w + 2 ) );
c03_2.v = _mm256_mul_pd( w_tmp.v, c03_2.v );
c47_2.v = _mm256_mul_pd( w_tmp.v, c47_2.v );
u03.v = _mm256_add_pd( u03.v, c03_2.v );
u47.v = _mm256_add_pd( u47.v, c47_2.v );
//for ( i = 0; i < 4; i++ ) {
//  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); }
//}
w_tmp.v = _mm256_broadcast_sd( (double*)( w + 3 ) );
c03_3.v = _mm256_mul_pd( w_tmp.v, c03_3.v );
c47_3.v = _mm256_mul_pd( w_tmp.v, c47_3.v );
u03.v = _mm256_add_pd( u03.v, c03_3.v );
u47.v = _mm256_add_pd( u47.v, c47_3.v );
//for ( i = 0; i < 4; i++ ) {
//  if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); }
//}

// Write the accumulated potentials back ( u assumed 32-byte aligned ):
_mm256_store_pd( (double*)u, u03.v );
_mm256_store_pd( (double*)( u + 4 ), u47.v );

// Debug checks on the final accumulation:
//for ( i = 0; i < 4; i++ ) {
//  if ( c03_0.d[ i ] != c03_0.d[ i ] ) { printf( "error gemv Nan: c03_0[ %d ]\n", i ); exit( 1 ); }
//  if ( c03_1.d[ i ] != c03_1.d[ i ] ) { printf( "error gemv Nan: c03_1[ %d ]\n", i ); exit( 1 ); }
//  if ( c03_2.d[ i ] != c03_2.d[ i ] ) { printf( "error gemv Nan: c03_2[ %d ]\n", i ); exit( 1 ); }
//  if ( c03_3.d[ i ] != c03_3.d[ i ] ) { printf( "error gemv Nan: c03_3[ %d ]\n", i ); exit( 1 ); }
//  if ( c47_0.d[ i ] != c47_0.d[ i ] ) { printf( "error gemv Nan: c47_0[ %d ]\n", i ); exit( 1 ); }
//  if ( c47_1.d[ i ] != c47_1.d[ i ] ) { printf( "error gemv Nan: c47_1[ %d ]\n", i ); exit( 1 ); }
//  if ( c47_2.d[ i ] != c47_2.d[ i ] ) { printf( "error gemv Nan: c47_2[ %d ]\n", i ); exit( 1 ); }
//  if ( c47_3.d[ i ] != c47_3.d[ i ] ) { printf( "error gemv Nan: c47_3[ %d ]\n", i ); exit( 1 ); }
//}
//for ( i = 0; i < 4; i++ ) {
//  if ( w[ i ] != w[ i ] ) { printf( "GSKS error w Nan: w03[ %d ]\n", i ); }
//}
//for ( i = 0; i < 4; i++ ) {
//  if ( u03.d[ i ] != u03.d[ i ] ) { printf( "GSKS error u Nan: u03[ %d ]\n", i ); }
//  if ( u47.d[ i ] != u47.d[ i ] ) { printf( "GSKS error u Nan: u47[ %d ]\n", i ); }
//}
//printf( "%lf\n", u03.d[0] );
//printf( "%lf\n", u03.d[1] );
//printf( "%lf\n", u03.d[2] );
//printf( "%lf\n", u03.d[3] );
//printf( "%lf\n", u47.d[0] );
//printf( "%lf\n", u47.d[1] );
//printf( "%lf\n", u47.d[2] );
//printf( "%lf\n", u47.d[3] );
}
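// The commented-out guards above rely on the IEEE-754 rule that NaN compares
// unequal to itself, so ( x != x ) is true only for NaN ( note this test
// breaks under -ffast-math ). A standalone version of that check; check_nan
// is an illustrative name, not from the original source.
#include <stdio.h>

static void check_nan( const double *v, int n, const char *name )
{
    for ( int i = 0; i < n; i++ )
        if ( v[ i ] != v[ i ] )              /* true only when v[ i ] is NaN */
            printf( "error Nan: %s[ %d ]\n", name, i );
}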