// Double-precision lane shuffle: the two low-lane results are picked from
// `lower`, the two high-lane results from `upper`, per the index parameters.
// NOTE(review): upper_i0/upper_i1/lower_i0/lower_i1 are assumed to be
// compile-time constants from the enclosing (template) scope -- the
// _mm256_shuffle_pd immediate must be a constant expression.
BOOST_FORCEINLINE __m256d shuffle(__m256d const lower, __m256d const upper)
{
    return _mm256_shuffle_pd(
        lower, upper,
        BOOST_SIMD_MM_SHUFFLE_PD(upper_i1, upper_i0, lower_i1, lower_i0)
    );
}
// Integer flavour of the double-precision shuffle: bit-casts to __m256d,
// performs the same lane-wise shuffle_pd as the __m256d overload, and casts
// back.  The casts compile to no instructions; only the shuffle is emitted.
//
// Fix: the selector was built with _MM_SHUFFLE, which packs four 2-bit
// fields ((a<<6)|(b<<4)|(c<<2)|d).  _mm256_shuffle_pd reads only the low
// 4 bits of its immediate -- one bit per output element -- so the two
// upper-lane selectors were silently discarded.  Use the 1-bit-per-field
// packer BOOST_SIMD_MM_SHUFFLE_PD, as the sibling __m256d/__m256 overloads
// already do.
BOOST_FORCEINLINE __m256i shuffle(__m256i const lower, __m256i const upper)
{
    return _mm256_castpd_si256(
        _mm256_shuffle_pd(
            _mm256_castsi256_pd(lower), _mm256_castsi256_pd(upper),
            BOOST_SIMD_MM_SHUFFLE_PD(upper_i1, upper_i0, lower_i1, lower_i0)
        )
    );
}
// Pair-wise float shuffle: treats each __m256 as four 64-bit elements (two
// adjacent floats each) and shuffles those pairs between `lower` and `upper`
// via the double-precision shuffle unit.  i0..i3 are assumed to be
// compile-time constants from the enclosing scope.
BOOST_FORCEINLINE __m256 shuffle_pairs(__m256 const lower, __m256 const upper)
{
    __m256d const lo_pairs = _mm256_castps_pd(lower);
    __m256d const up_pairs = _mm256_castps_pd(upper);
    __m256d const mixed    = _mm256_shuffle_pd(lo_pairs, up_pairs,
                                               BOOST_SIMD_MM_SHUFFLE_PD(i3, i2, i1, i0));
    return _mm256_castpd_ps(mixed);
}
/*
 * AVX shoelace-formula accumulator for an irregular polygon.
 * The signature is generated by the irreg_poly_area_func_sign macro, which
 * presumably expands to a function header providing `cords` (the vertex
 * array, laid out as x0,y0,x1,y1,...) and `cords_len` -- TODO confirm at the
 * macro definition.
 *
 * Computes the signed shoelace sum  sum_i (x_i*y_{i+1} - y_i*x_{i+1}),
 * four cross-terms per iteration; the caller is expected to halve and take
 * the absolute value (see the commented-out return at the bottom).
 *
 * Requires 32-byte-aligned `cords` (_mm256_load_pd is an aligned load).
 * NOTE(review): if cords_len is an unsigned type and < 4, (cords_len - 4)
 * wraps around and the vector loop reads far out of bounds -- confirm
 * callers guarantee cords_len >= 5, or the guard should be strengthened.
 */
irreg_poly_area_func_sign(double, _avx) {
    if (__builtin_expect(is_null(cords) || cords_len == 0, 0))
        return 0;

    __m256d curr, forw, coef_0, coef_1,
        end = _mm256_load_pd((const double *)cords),      /* first vertex pair, reused as `curr` next iteration */
        accum_sum = _mm256_setzero_pd();                  /* 4 running partial cross-sums */
    double accum_sum_aux;
    unsigned long index;

    /* Main loop: consumes 4 edges (8 doubles) per iteration. */
    for (index = 0; index < (cords_len - 4); index += 4) {
        curr = end;                                                  // x0,y0,x1,y1
        forw = _mm256_load_pd((const double *)&cords[index + 2]);    // x2,y2,x3,y3
        end  = _mm256_load_pd((const double *)&cords[index + 4]);    // x4,y4,x5,y5

        coef_0 = _mm256_permute2f128_pd(curr, forw, 0b00110001);     // x1, y1, x3, y3
        coef_1 = _mm256_permute2f128_pd(forw, end, 0b00100000);      // x2, y2, x4, y4

        //_mm256_hsub_pd(a, b) == a0 - a1, b0 - b1, a2 - a3, b2 - b3
        accum_sum = _mm256_add_pd(
            accum_sum,
            _mm256_hsub_pd( // x0*y1 - y0*x1, x1*y2 - y1x2, x2*y3 - y2*x3, x3*y4 - y3*x4
                _mm256_mul_pd( // x0*y1, y0*x1, x2*y3, y2*x3
                    _mm256_permute2f128_pd(curr, forw, 0b00100000),  // x0, y0, x2, y2
                    _mm256_shuffle_pd(coef_0, coef_0, 0b0101)        // y1, x1, y3, x3
                ),
                _mm256_mul_pd(coef_0, _mm256_shuffle_pd(coef_1, coef_1, 0b0101)) // y2, x2, y4, x4
                // ^^^^^^^^^^^^^^^ x1*y2, y1*x2, x3*y4, y3*x4
            )
        );
    }

    /* Horizontal reduction of the 4 partial sums into lane 0. */
    accum_sum = _mm256_hadd_pd(accum_sum, _mm256_permute2f128_pd(accum_sum, accum_sum, 1)); // a0+a1, a2+a3, a2+a3, a0+a1
    accum_sum = _mm256_hadd_pd(accum_sum, accum_sum); // a0+a1+a2+a3, ...

    /* Scalar tail: finish the remaining edges one cross-product at a time. */
    for (accum_sum_aux = _mm_cvtsd_f64(_mm256_castpd256_pd128(accum_sum)); index < (cords_len - 1); index++)
        accum_sum_aux += _calc_diff_of_adj_prods(cords, index);

    return accum_sum_aux; // return scalar_half(scalar_abs(accum_sum_aux));
}
// Deinterleave an array of single-precision Complex (real, imag) pairs into
// separate double-precision in-phase (dstI) and quadrature (dstQ) arrays.
// Four complex samples are processed per AVX iteration; any remainder
// (len not a multiple of 4) is converted by the scalar tail loop.
// NOTE(review): SET_1 is assumed to be the all-ones maskload element
// (load enabled) -- confirm at its definition.
void ComplexToDouble(Complex *src, double *dstI, double *dstQ, const unsigned int len)
{
    // Loop-invariant maskload control (same element in every slot).
    const __m128i load_mask = _mm_set_epi32(SET_1, SET_1, SET_1, SET_1);

    for (unsigned int idx = 0; idx + 4 <= len; idx += 4) {
        // Two loads of two complex floats each: (r0,i0,r1,i1) and (r2,i2,r3,i3).
        __m128 lo_f = _mm_maskload_ps((float*)(src + idx),     load_mask);
        __m128 hi_f = _mm_maskload_ps((float*)(src + idx + 2), load_mask);

        // Widen float -> double.
        __m256d lo_d = _mm256_cvtps_pd(lo_f);
        __m256d hi_d = _mm256_cvtps_pd(hi_f);

        // Regroup 128-bit halves: x = r0,i0,r2,i2   y = r1,i1,r3,i3
        __m256d x = _mm256_permute2f128_pd(lo_d, hi_d, 0x20);
        __m256d y = _mm256_permute2f128_pd(lo_d, hi_d, 0x31);

        // Select even elements -> all reals, odd elements -> all imags.
        __m256d re = _mm256_shuffle_pd(x, y, 0x00);   // r0,r1,r2,r3
        __m256d im = _mm256_shuffle_pd(x, y, 0x0f);   // i0,i1,i2,i3

        _mm256_storeu_pd(dstI + idx, re);
        _mm256_storeu_pd(dstQ + idx, im);
    }

    // Scalar tail for the last len % 4 samples.
    for (unsigned int idx = len - (len & 0x03); idx < len; ++idx) {
        dstI[idx] = static_cast<double>(src[idx].m_real);
        dstQ[idx] = static_cast<double>(src[idx].m_imag);
    }
}
/*
 * Sorts the sixteen 4-bit nibbles inside each 64-bit word of `buf`,
 * processing 4 words (one __m256i) per iteration.  Each round performs a
 * byte-wise compare-exchange (min/max over the nibbles, which after masking
 * are 0..15 and therefore safe for the *signed* epi8 min/max) followed by a
 * pshufb rearrangement -- the shufN constants below are the per-round
 * routing tables of the sorting network.
 *
 * Uses GCC/Clang vector extensions (brace init, &=, ^=, |, vector casts).
 * NOTE(review): the trip count (1024 / 4) hard-codes a 1024-element buffer;
 * confirm all callers pass exactly 1024 uint64_t.
 */
void nibble_sort_beekman1(uint64_t *buf) {
  // already in the right order
  //__m256i
  //    shuf0={0x1716151413121110ULL,0x1f1e1d1c1b1a1918ULL,0x0706050403020100ULL,0x0f0e0d0c0b0a0908ULL};
  // pshufb control masks: one per round of the network.
  __m256i shuf1 = {0x1e161c141a121810ULL, 0x1f171d151b131911ULL, 0x0e060c040a020800ULL, 0x0f070d050b030901ULL};
  __m256i shuf2 = {0x1d1c151419181110ULL, 0x1f1e17161b1a1312ULL, 0x0d0c050409080100ULL, 0x0f0e07060b0a0302ULL};
  // use less instructions below
  //__m256i
  //    shuf3={0x1b1a191813121110ULL,0x1f1e1d1c17161514ULL,0x0b0a090803020100ULL,0x0f0e0d0c07060504ULL};
  __m256i shuf4 = {0x101d171615141311ULL, 0x1f1e1b191a181c12ULL, 0x000d070605040301ULL, 0x0f0e0b090a080c02ULL};
  __m256i shuf5 = {0x171d151413111810ULL, 0x1f1e16191c1b1a12ULL, 0x070d050403010800ULL, 0x0f0e06090c0b0a02ULL};
  __m256i shuf6 = {0x1e17161a15141211ULL, 0x1f101d1c1b191318ULL, 0x0e07060a05040201ULL, 0x0f000d0c0b090308ULL};
  __m256i shuf7 = {0x171510161b131911ULL, 0x1f1d181e1c141a12ULL, 0x070500060b030901ULL, 0x0f0d080e0c040a02ULL};
  __m256i shuf8 = {0x1715141613121110ULL, 0x1f1e1c1b1a19181dULL, 0x0705040603020100ULL, 0x0f0e0c0b0a09080dULL};
  __m256i shuf9 = {0x171c1b1a19181615ULL, 0x1f1e14131211101dULL, 0x070c0b0a09080605ULL, 0x0f0e04030201000dULL};
  __m256i nibblemask = _mm256_set1_epi8(0x0f);
  for (uint32_t i = 0; i < (1024 / 4); i += 1) {
    __m256i r0 = _mm256_loadu_si256(((__m256i *)buf) + i), r1 = r0, r2;
    // Split each byte: r0 keeps the low nibbles, r1 gets the high nibbles
    // moved down into the low-nibble position of their own byte.
    r0 &= nibblemask;
    r1 ^= r0;
    r1 = _mm256_srli_epi64(r1, 4);
// One round: compare-exchange (min/max), interleave the two results with
// shuffle_pd, route with the round's pshufb table, then de-interleave back
// into r0/r1.  (No comments inside the macro: line splicing would extend a
// // comment across the backslash continuations.)
#define sort_and_shuffle(n) \
  r2 = _mm256_max_epi8(r0, r1); \
  r0 = _mm256_min_epi8(r0, r1); \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b0000); \
  r2 = (__m256i)_mm256_shuffle_pd((__m256d)r0, (__m256d)r2, 0b1111); \
  r1 = _mm256_shuffle_epi8(r1, shuf##n); \
  r2 = _mm256_shuffle_epi8(r2, shuf##n); \
  r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000); \
  r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111)
    sort_and_shuffle(1);
    sort_and_shuffle(2);
    { // sort_and_shuffle(3);
      // Round 3 inlined: its routing is expressible with unpacklo/unpackhi,
      // which is cheaper than a pshufb table (see "use less instructions"
      // note above).
      r2 = _mm256_max_epi8(r0, r1);
      r0 = _mm256_min_epi8(r0, r1);
      r1 = (__m256i)_mm256_unpacklo_ps((__m256)r0, (__m256)r2);
      r2 = (__m256i)_mm256_unpackhi_ps((__m256)r0, (__m256)r2);
      r0 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b1111);
      r1 = (__m256i)_mm256_shuffle_pd((__m256d)r1, (__m256d)r2, 0b0000);
    }
    sort_and_shuffle(4);
    sort_and_shuffle(5);
    sort_and_shuffle(6);
    sort_and_shuffle(7);
    sort_and_shuffle(8);
    sort_and_shuffle(9);
    // Recombine: r1 holds the high nibbles, shift them back up and merge
    // with the low nibbles in r0.
    r1 = _mm256_slli_epi64(r1, 4);
    _mm256_storeu_si256(((__m256i *)buf) + i, r1 | r0);
  }
}
/*
 * 8x4 micro-kernel: rank-k update followed by a squared-distance epilogue.
 *
 * Accumulates the 8x4 inner-product block  ab[i][j] = sum_p a[p*8+i]*b[p*4+j]
 * over k (unrolled by 2, with software prefetch of A, B, C and aux->b_next),
 * then combines it with the per-row vector `aa` and per-column vector `bb`:
 *
 *     c[i][j] = max( 0, -2 * ab[i][j] + aa[i] + bb[j] )
 *
 * and stores the block to `c` as 8 contiguous rows of 4 doubles.
 * If aa/bb hold precomputed squared norms this is the squared Euclidean
 * distance ||x_i||^2 - 2 x_i.y_j + ||y_j||^2 -- presumably so, given the
 * clamp-at-zero step; confirm against the caller.
 *
 * Assumes a, b, aa, c are 32-byte aligned (aligned _mm256 loads/stores) and
 * that a/b are packed in 8- and 4-wide panels.  The b vector is consumed in
 * rotated order (shuffle 0x5 / permute 0x1), so each accumulator holds a
 * "butterfly" diagonal that the blend/permute epilogue straightens out.
 * (Commented-out debug printf blocks and the scalar reference loop have
 * been removed for readability.)
 */
void rnn_int_d8x4_var2( int k, double *aa, double *a, double *bb, double *b, double *c, aux_t *aux ) {
  int i;
  double neg2 = -2.0;
  double dzero = 0.0;
  v4df_t c03_0, c03_1, c03_2, c03_3;          // accumulators, rows 0..3
  v4df_t c47_0, c47_1, c47_2, c47_3;          // accumulators, rows 4..7
  v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3;
  v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3;
  v4df_t c_tmp;
  v4df_t a03, a47;
  v4df_t A03, A47; // prefetched A
  v4df_t b0, b1, b2, b3;
  v4df_t B0; // prefetched B
  v4df_t aa_tmp, bb_tmp;
  int k_iter = k / 2;                         // main loop: 2 rank-1 updates per trip
  int k_left = k % 2;                         // remainder handled after
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) );
  __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( aux->b_next ) );
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( c ) );
  c03_0.v = _mm256_setzero_pd();
  c03_1.v = _mm256_setzero_pd();
  c03_2.v = _mm256_setzero_pd();
  c03_3.v = _mm256_setzero_pd();
  c47_0.v = _mm256_setzero_pd();
  c47_1.v = _mm256_setzero_pd();
  c47_2.v = _mm256_setzero_pd();
  c47_3.v = _mm256_setzero_pd();
  // Load a03
  a03.v = _mm256_load_pd( (double*)a );
  // Load a47
  a47.v = _mm256_load_pd( (double*)( a + 4 ) );
  // Load (b0,b1,b2,b3)
  b0.v = _mm256_load_pd( (double*)b );
  for ( i = 0; i < k_iter; ++i ) {
    // Iteration #0: uses a03/a47/b0 while preloading A03/A47/B0 for #1.
    __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) );
    // Preload A03
    A03.v = _mm256_load_pd( (double*)( a + 8 ) );
    c_tmp.v = _mm256_mul_pd( a03.v , b0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    // Preload A47
    A47.v = _mm256_load_pd( (double*)( a + 12 ) );
    // Shuffle b ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v , b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );
    // Preload B0
    B0.v = _mm256_load_pd( (double*)( b + 4 ) );
    c_tmp.v = _mm256_mul_pd( a03.v , b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
    // Shuffle b ( 3, 2, 1, 0 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v , b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
    // Iteration #1
    __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) );
    // Preload a03 ( next iteration )
    a03.v = _mm256_load_pd( (double*)( a + 16 ) );
    c_tmp.v = _mm256_mul_pd( A03.v , B0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( A47.v , B0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    c_tmp.v = _mm256_mul_pd( A03.v , b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    // Preload a47 ( next iteration )
    a47.v = _mm256_load_pd( (double*)( a + 20 ) );
    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );
    c_tmp.v = _mm256_mul_pd( A47.v , b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    c_tmp.v = _mm256_mul_pd( A03.v , b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    // Shuffle b ( 3, 2, 1, 0 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( A47.v , b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
    // Load b0 ( next iteration )
    b0.v = _mm256_load_pd( (double*)( b + 8 ) );
    c_tmp.v = _mm256_mul_pd( A03.v , b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( A47.v , b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
    a += 16;
    b += 8;
  }
  // Remainder: at most one more rank-1 update when k is odd.
  for ( i = 0; i < k_left; ++i ) {
    a03.v = _mm256_load_pd( (double*)a );
    a47.v = _mm256_load_pd( (double*)( a + 4 ) );
    b0.v = _mm256_load_pd( (double*)b );
    c_tmp.v = _mm256_mul_pd( a03.v , b0.v );
    c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b0.v );
    c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v );
    // Shuffle b ( 1, 0, 3, 2 )
    b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v , b1.v );
    c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b1.v );
    c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v );
    // Permute b ( 3, 2, 1, 0 )
    b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 );
    c_tmp.v = _mm256_mul_pd( a03.v , b2.v );
    c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b2.v );
    c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v );
    // Shuffle b ( 3, 2, 1, 0 )
    b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 );
    c_tmp.v = _mm256_mul_pd( a03.v , b3.v );
    c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v );
    c_tmp.v = _mm256_mul_pd( a47.v , b3.v );
    c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v );
    a += 8;
    b += 4;
  }
  // Prefetch aa and bb
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) );
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) );
  // Straighten the butterfly: blend/permute so c03_x / c47_x hold column x
  // of the inner products for rows 0..3 / 4..7.
  tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 );
  tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 );
  tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 );
  tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 );
  tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 );
  tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 );
  tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 );
  tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 );
  c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 );
  c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 );
  c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 );
  c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 );
  c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 );
  c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 );
  c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 );
  c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 );
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->I ) );
  __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aux->D ) );
  // Scale all inner products by -2.
  aa_tmp.v = _mm256_broadcast_sd( &neg2 );
  c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v );
  c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v );
  c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v );
  c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v );
  c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v );
  c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v );
  c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v );
  c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v );
  // Add the per-row terms aa[0..3] / aa[4..7] to every column.
  aa_tmp.v = _mm256_load_pd( (double*)aa );
  c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v );
  c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v );
  c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v );
  c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v );
  aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) );
  c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v );
  c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v );
  c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v );
  c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v );
  // Add the per-column terms bb[0..3], one broadcast per column.
  bb_tmp.v = _mm256_broadcast_sd( (double*)bb );
  c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v );
  c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) );
  c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v );
  c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) );
  c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v );
  c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v );
  bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) );
  c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v );
  c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v );
  // Clamp illegal (negative, from rounding) values at zero.
  c_tmp.v = _mm256_broadcast_sd( &dzero );
  c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v );
  c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v );
  c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v );
  c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v );
  c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v );
  c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v );
  c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v );
  c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v );
  // Transpose c03/c47 _0, _1, _2, _3 to be the row vector
  // (standard 4x4 shuffle_pd + permute2f128 transpose).
  tmpc03_0.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0x0 );
  tmpc03_1.v = _mm256_shuffle_pd( c03_0.v, c03_1.v, 0xF );
  tmpc03_2.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0x0 );
  tmpc03_3.v = _mm256_shuffle_pd( c03_2.v, c03_3.v, 0xF );
  tmpc47_0.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0x0 );
  tmpc47_1.v = _mm256_shuffle_pd( c47_0.v, c47_1.v, 0xF );
  tmpc47_2.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0x0 );
  tmpc47_3.v = _mm256_shuffle_pd( c47_2.v, c47_3.v, 0xF );
  c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x20 );
  c03_2.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x31 );
  c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x20 );
  c03_3.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x31 );
  c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x20 );
  c47_2.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x31 );
  c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x20 );
  c47_3.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x31 );
  // Store the 8x4 block: rows 0..7, 4 doubles per row.
  _mm256_store_pd( c , c03_0.v );
  _mm256_store_pd( c + 4, c03_1.v );
  _mm256_store_pd( c + 8, c03_2.v );
  _mm256_store_pd( c + 12, c03_3.v );
  _mm256_store_pd( c + 16, c47_0.v );
  _mm256_store_pd( c + 20, c47_1.v );
  _mm256_store_pd( c + 24, c47_2.v );
  _mm256_store_pd( c + 28, c47_3.v );
}
void ks_gaussian_int_d8x4( int k, double alpha, double *u, double *aa, double *a, double *bb, double *b, double *w, aux_t *aux ) { int i; double neg2 = -2.0; double dzero = 0.0; v4df_t c03_0, c03_1, c03_2, c03_3; v4df_t c47_0, c47_1, c47_2, c47_3; v4df_t tmpc03_0, tmpc03_1, tmpc03_2, tmpc03_3; v4df_t tmpc47_0, tmpc47_1, tmpc47_2, tmpc47_3; v4df_t c_tmp; v4df_t u03; v4df_t u47; v4df_t a03, a47; v4df_t A03, A47; // prefetched A v4df_t b0, b1, b2, b3; v4df_t B0; // prefetched B v4df_t aa_tmp, bb_tmp; v4df_t w_tmp; //// Inline vdExp() //const double log2e = 1.4426950408889634073599; //const double maxlog = 7.09782712893383996843e2; // log( 2**1024 ) //const double minlog = -7.08396418532264106224e2; // log( 2**-1024 ) //const double one = 1.0; //const double c1 = 6.93145751953125E-1; //const double c2 = 1.42860682030941723212E-6; //// Original Remez Order 11 coefficients //const double w11 = 3.5524625185478232665958141148891055719216674475023e-8; //const double w10 = 2.5535368519306500343384723775435166753084614063349e-7; //const double w9 = 2.77750562801295315877005242757916081614772210463065e-6; //const double w8 = 2.47868893393199945541176652007657202642495832996107e-5; //const double w7 = 1.98419213985637881240770890090795533564573406893163e-4; //const double w6 = 1.3888869684178659239014256260881685824525255547326e-3; //const double w5 = 8.3333337052009872221152811550156335074160546333973e-3; //const double w4 = 4.1666666621080810610346717440523105184720007971655e-2; //const double w3 = 0.166666666669960803484477734308515404418108830469798; //const double w2 = 0.499999999999877094481580370323249951329122224389189; //const double w1 = 1.0000000000000017952745258419615282194236357388884; //const double w0 = 0.99999999999999999566016490920259318691496540598896; // Remez Order 11 polynomail approximation //const double w0 = 9.9999999999999999694541216787022234814339814028865e-1; //const double w1 = 1.0000000000000013347525109964212249781265243645457; //const double w2 
= 4.9999999999990426011279542064313207349934058355357e-1; //const double w3 = 1.6666666666933781279020916199156875162816850273886e-1; //const double w4 = 4.1666666628388978913396218847247771982698350546174e-2; //const double w5 = 8.3333336552944126722390410619859929515740995889372e-3; //const double w6 = 1.3888871805082296012945081624687544823497126781709e-3; //const double w7 = 1.9841863599469418342286677256362193951266072398489e-4; //const double w8 = 2.4787899938611697691690479138150629377630767114546e-5; //const double w9 = 2.7764095757136528235740765949934667970688427190168e-6; //const double w10 = 2.5602485412126369546033948405199058329040797134573e-7; //const double w11 = 3.5347283721656121939634391175390704621351283546671e-8; // Remez Order 9 polynomail approximation // const double w0 = 9.9999999999998657717890998293462356769270934668652e-1; // const double w1 = 1.0000000000041078023971691258305486059867172736079; // const double w2 = 4.9999999979496223000111361187419539211772440139043e-1; // const double w3 = 1.6666667059968250851708016603646727895353772273675e-1; // const double w4 = 4.1666628655740875994884332519499013211594753124142e-2; // const double w5 = 8.3335428149736685441705398632467122758546893330069e-3; // const double w6 = 1.3881912931358424526285652289974115047170651985345e-3; // const double w7 = 1.9983735415194021112767942931416179152416729204150e-4; // const double w8 = 2.3068467290270483679711135625155862511780587976925e-5; // const double w9 = 3.8865682386514872192656192137071689334005518164704e-6; //v4df_t a03_0, a03_1, a03_2, a03_3; //v4df_t a47_0, a47_1, a47_2, a47_3; //v4df_t p03_0, p03_1, p03_2, p03_3; //v4df_t p47_0, p47_1, p47_2, p47_3; //v4df_t y, l2e, tmp, p; //v4li_t k03_0, k03_1, k03_2, k03_3; //v4li_t k47_0, k47_1, k47_2, k47_3; //v4li_t offset; //v4li_t k1, k2; //__m128d p1, p2; int k_iter = k / 2; int k_left = k % 2; __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( a ) ); __asm__ volatile( "prefetcht2 0(%0) \n\t" : :"r"( 
aux->b_next ) ); c03_0.v = _mm256_setzero_pd(); c03_1.v = _mm256_setzero_pd(); c03_2.v = _mm256_setzero_pd(); c03_3.v = _mm256_setzero_pd(); c47_0.v = _mm256_setzero_pd(); c47_1.v = _mm256_setzero_pd(); c47_2.v = _mm256_setzero_pd(); c47_3.v = _mm256_setzero_pd(); // Load a03 a03.v = _mm256_load_pd( (double*)a ); // Load a47 a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // Load (b0,b1,b2,b3) b0.v = _mm256_load_pd( (double*)b ); for ( i = 0; i < k_iter; ++i ) { __asm__ volatile( "prefetcht0 192(%0) \n\t" : :"r"(a) ); // Preload A03 A03.v = _mm256_load_pd( (double*)( a + 8 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Preload A47 A47.v = _mm256_load_pd( (double*)( a + 12 ) ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); // Preload B0 B0.v = _mm256_load_pd( (double*)( b + 4 ) ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); // Iteration #1 __asm__ volatile( "prefetcht0 512(%0) \n\t" : :"r"(a) ); // Preload a03 ( next iteration ) a03.v = _mm256_load_pd( (double*)( a + 16 ) ); c_tmp.v = _mm256_mul_pd( A03.v , B0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); b1.v = _mm256_shuffle_pd( B0.v, B0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , B0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); c_tmp.v = 
_mm256_mul_pd( A03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // Preload a47 ( next iteration ) a47.v = _mm256_load_pd( (double*)( a + 20 ) ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( A47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); c_tmp.v = _mm256_mul_pd( A03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( A47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Load b0 ( next iteration ) b0.v = _mm256_load_pd( (double*)( b + 8 ) ); c_tmp.v = _mm256_mul_pd( A03.v , b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( A47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 16; b += 8; } for ( i = 0; i < k_left; ++i ) { a03.v = _mm256_load_pd( (double*)a ); //printf( "a03 = %lf, %lf, %lf, %lf\n", a03.d[0], a03.d[1], a03.d[2], a03.d[3] ); a47.v = _mm256_load_pd( (double*)( a + 4 ) ); //printf( "a47 = %lf, %lf, %lf, %lf\n", a47.d[0], a47.d[1], a47.d[2], a47.d[3] ); b0.v = _mm256_load_pd( (double*)b ); //printf( "b0 = %lf, %lf, %lf, %lf\n", b0.d[0], b0.d[1], b0.d[2], b0.d[3] ); c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // Shuffle b ( 1, 0, 3, 2 ) b1.v = _mm256_shuffle_pd( b0.v, b0.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // Permute b ( 3, 2, 1, 0 ) b2.v = _mm256_permute2f128_pd( b1.v, b1.v, 0x1 ); c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // Shuffle b ( 3, 2, 1, 0 ) b3.v = _mm256_shuffle_pd( b2.v, b2.v, 0x5 ); c_tmp.v = _mm256_mul_pd( a03.v , 
b3.v ); c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); a += 8; b += 4; } // Prefetch aa and bb __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( aa ) ); __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( bb ) ); tmpc03_0.v = _mm256_blend_pd( c03_0.v, c03_1.v, 0x6 ); tmpc03_1.v = _mm256_blend_pd( c03_1.v, c03_0.v, 0x6 ); tmpc03_2.v = _mm256_blend_pd( c03_2.v, c03_3.v, 0x6 ); tmpc03_3.v = _mm256_blend_pd( c03_3.v, c03_2.v, 0x6 ); tmpc47_0.v = _mm256_blend_pd( c47_0.v, c47_1.v, 0x6 ); tmpc47_1.v = _mm256_blend_pd( c47_1.v, c47_0.v, 0x6 ); tmpc47_2.v = _mm256_blend_pd( c47_2.v, c47_3.v, 0x6 ); tmpc47_3.v = _mm256_blend_pd( c47_3.v, c47_2.v, 0x6 ); c03_0.v = _mm256_permute2f128_pd( tmpc03_0.v, tmpc03_2.v, 0x30 ); c03_3.v = _mm256_permute2f128_pd( tmpc03_2.v, tmpc03_0.v, 0x30 ); c03_1.v = _mm256_permute2f128_pd( tmpc03_1.v, tmpc03_3.v, 0x30 ); c03_2.v = _mm256_permute2f128_pd( tmpc03_3.v, tmpc03_1.v, 0x30 ); c47_0.v = _mm256_permute2f128_pd( tmpc47_0.v, tmpc47_2.v, 0x30 ); c47_3.v = _mm256_permute2f128_pd( tmpc47_2.v, tmpc47_0.v, 0x30 ); c47_1.v = _mm256_permute2f128_pd( tmpc47_1.v, tmpc47_3.v, 0x30 ); c47_2.v = _mm256_permute2f128_pd( tmpc47_3.v, tmpc47_1.v, 0x30 ); //printf( "rank-k\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < k; i++ ) { // a03.v = 
_mm256_load_pd( (double*)a ); // a47.v = _mm256_load_pd( (double*)( a + 4 ) ); // b0.v = _mm256_broadcast_sd( (double*)b ); // b1.v = _mm256_broadcast_sd( (double*)( b + 1 ) ); // b2.v = _mm256_broadcast_sd( (double*)( b + 2 ) ); // b3.v = _mm256_broadcast_sd( (double*)( b + 3 ) ); // a += DKS_MR; // b += DKS_NR; // c_tmp.v = _mm256_mul_pd( a03.v , b0.v ); // c03_0.v = _mm256_add_pd( c_tmp.v, c03_0.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b1.v ); // c03_1.v = _mm256_add_pd( c_tmp.v, c03_1.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b2.v ); // c03_2.v = _mm256_add_pd( c_tmp.v, c03_2.v ); // c_tmp.v = _mm256_mul_pd( a03.v , b3.v ); // c03_3.v = _mm256_add_pd( c_tmp.v, c03_3.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b0.v ); // c47_0.v = _mm256_add_pd( c_tmp.v, c47_0.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b1.v ); // c47_1.v = _mm256_add_pd( c_tmp.v, c47_1.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b2.v ); // c47_2.v = _mm256_add_pd( c_tmp.v, c47_2.v ); // c_tmp.v = _mm256_mul_pd( a47.v , b3.v ); // c47_3.v = _mm256_add_pd( c_tmp.v, c47_3.v ); //} aa_tmp.v = _mm256_broadcast_sd( &neg2 ); //c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); //c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); //c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); //c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); //c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); //c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); //c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); //c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); //printf( "scale -2 \n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", 
c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); aa_tmp.v = _mm256_load_pd( (double*)aa ); c03_0.v = _mm256_add_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_add_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_add_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_add_pd( aa_tmp.v, c03_3.v ); //printf( "aa03 = %lf, %lf, %lf, %lf\n", aa_tmp.d[0], aa_tmp.d[1], aa_tmp.d[2], aa_tmp.d[3] ); //printf( "bb03 = %lf, %lf, %lf, %lf\n", bb[ 0 ], bb[ 1 ], bb[ 2 ], bb[ 3 ] ); aa_tmp.v = _mm256_load_pd( (double*)( aa + 4 ) ); c47_0.v = _mm256_add_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_add_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_add_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_add_pd( aa_tmp.v, c47_3.v ); //printf( "add a^2\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // Prefetch u __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); bb_tmp.v = _mm256_broadcast_sd( (double*)bb 
); c03_0.v = _mm256_add_pd( bb_tmp.v, c03_0.v ); c47_0.v = _mm256_add_pd( bb_tmp.v, c47_0.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 1 ) ); c03_1.v = _mm256_add_pd( bb_tmp.v, c03_1.v ); c47_1.v = _mm256_add_pd( bb_tmp.v, c47_1.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 2 ) ); c03_2.v = _mm256_add_pd( bb_tmp.v, c03_2.v ); c47_2.v = _mm256_add_pd( bb_tmp.v, c47_2.v ); bb_tmp.v = _mm256_broadcast_sd( (double*)( bb + 3 ) ); c03_3.v = _mm256_add_pd( bb_tmp.v, c03_3.v ); c47_3.v = _mm256_add_pd( bb_tmp.v, c47_3.v ); // Check if there is any illegle value c_tmp.v = _mm256_broadcast_sd( &dzero ); c03_0.v = _mm256_max_pd( c_tmp.v, c03_0.v ); c03_1.v = _mm256_max_pd( c_tmp.v, c03_1.v ); c03_2.v = _mm256_max_pd( c_tmp.v, c03_2.v ); c03_3.v = _mm256_max_pd( c_tmp.v, c03_3.v ); c47_0.v = _mm256_max_pd( c_tmp.v, c47_0.v ); c47_1.v = _mm256_max_pd( c_tmp.v, c47_1.v ); c47_2.v = _mm256_max_pd( c_tmp.v, c47_2.v ); c47_3.v = _mm256_max_pd( c_tmp.v, c47_3.v ); // Scale before the kernel evaluation aa_tmp.v = _mm256_broadcast_sd( &alpha ); c03_0.v = _mm256_mul_pd( aa_tmp.v, c03_0.v ); c03_1.v = _mm256_mul_pd( aa_tmp.v, c03_1.v ); c03_2.v = _mm256_mul_pd( aa_tmp.v, c03_2.v ); c03_3.v = _mm256_mul_pd( aa_tmp.v, c03_3.v ); c47_0.v = _mm256_mul_pd( aa_tmp.v, c47_0.v ); c47_1.v = _mm256_mul_pd( aa_tmp.v, c47_1.v ); c47_2.v = _mm256_mul_pd( aa_tmp.v, c47_2.v ); c47_3.v = _mm256_mul_pd( aa_tmp.v, c47_3.v ); // Preload u03, u47 u03.v = _mm256_load_pd( (double*)u ); u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // Prefetch w __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); #include "ks_exp_int_d8x4.h" //printf( "square distance\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); 
//printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error Nan: c47_3[ %d ]\n", i ); // } //} // tmp.v = _mm256_broadcast_sd( &maxlog ); // c03_0.v = _mm256_min_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_min_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_min_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_min_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_min_pd( tmp.v, c47_0.v ); // c47_1.v = _mm256_min_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_min_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_min_pd( tmp.v, c47_3.v ); // tmp.v = _mm256_broadcast_sd( &minlog ); // c03_0.v = _mm256_max_pd( tmp.v, c03_0.v ); // c03_1.v = _mm256_max_pd( tmp.v, c03_1.v ); // c03_2.v = _mm256_max_pd( tmp.v, c03_2.v ); // c03_3.v = _mm256_max_pd( tmp.v, c03_3.v ); // c47_0.v = _mm256_max_pd( tmp.v, c47_0.v ); // c47_1.v = _mm256_max_pd( tmp.v, c47_1.v ); // c47_2.v = _mm256_max_pd( tmp.v, c47_2.v ); // c47_3.v = _mm256_max_pd( tmp.v, c47_3.v ); // // // a = c / log2e // // c = a * ln2 = k * ln2 + w, ( w in [ -ln2, ln2 ] ) // l2e.v 
= _mm256_broadcast_sd( &log2e ); // a03_0.v = _mm256_mul_pd( l2e.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( l2e.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( l2e.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( l2e.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( l2e.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( l2e.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( l2e.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( l2e.v, c47_3.v ); // // // Check if a < 0 // tmp.v = _mm256_setzero_pd(); // p03_0.v = _mm256_cmp_pd( a03_0.v, tmp.v, 1 ); // p03_1.v = _mm256_cmp_pd( a03_1.v, tmp.v, 1 ); // p03_2.v = _mm256_cmp_pd( a03_2.v, tmp.v, 1 ); // p03_3.v = _mm256_cmp_pd( a03_3.v, tmp.v, 1 ); // p47_0.v = _mm256_cmp_pd( a47_0.v, tmp.v, 1 ); // p47_1.v = _mm256_cmp_pd( a47_1.v, tmp.v, 1 ); // p47_2.v = _mm256_cmp_pd( a47_2.v, tmp.v, 1 ); // p47_3.v = _mm256_cmp_pd( a47_3.v, tmp.v, 1 ); // tmp.v = _mm256_broadcast_sd( &one ); // p03_0.v = _mm256_and_pd( tmp.v, p03_0.v ); // p03_1.v = _mm256_and_pd( tmp.v, p03_1.v ); // p03_2.v = _mm256_and_pd( tmp.v, p03_2.v ); // p03_3.v = _mm256_and_pd( tmp.v, p03_3.v ); // p47_0.v = _mm256_and_pd( tmp.v, p47_0.v ); // p47_1.v = _mm256_and_pd( tmp.v, p47_1.v ); // p47_2.v = _mm256_and_pd( tmp.v, p47_2.v ); // p47_3.v = _mm256_and_pd( tmp.v, p47_3.v ); // // If a < 0 ( w < 0 ), then a - 1 = ( k - 1 ) + w / ln2 // a03_0.v = _mm256_sub_pd( a03_0.v, p03_0.v ); // a03_1.v = _mm256_sub_pd( a03_1.v, p03_1.v ); // a03_2.v = _mm256_sub_pd( a03_2.v, p03_2.v ); // a03_3.v = _mm256_sub_pd( a03_3.v, p03_3.v ); // a47_0.v = _mm256_sub_pd( a47_0.v, p47_0.v ); // a47_1.v = _mm256_sub_pd( a47_1.v, p47_1.v ); // a47_2.v = _mm256_sub_pd( a47_2.v, p47_2.v ); // a47_3.v = _mm256_sub_pd( a47_3.v, p47_3.v ); // // Compute floor( a ) by two conversions // // if a < 0, p = k - 1 // // else , p = k // k03_0.v = _mm256_cvttpd_epi32( a03_0.v ); // k03_1.v = _mm256_cvttpd_epi32( a03_1.v ); // k03_2.v = _mm256_cvttpd_epi32( a03_2.v ); // k03_3.v = _mm256_cvttpd_epi32( a03_3.v ); // k47_0.v = 
_mm256_cvttpd_epi32( a47_0.v ); // k47_1.v = _mm256_cvttpd_epi32( a47_1.v ); // k47_2.v = _mm256_cvttpd_epi32( a47_2.v ); // k47_3.v = _mm256_cvttpd_epi32( a47_3.v ); // p03_0.v = _mm256_cvtepi32_pd( k03_0.v ); // p03_1.v = _mm256_cvtepi32_pd( k03_1.v ); // p03_2.v = _mm256_cvtepi32_pd( k03_2.v ); // p03_3.v = _mm256_cvtepi32_pd( k03_3.v ); // p47_0.v = _mm256_cvtepi32_pd( k47_0.v ); // p47_1.v = _mm256_cvtepi32_pd( k47_1.v ); // p47_2.v = _mm256_cvtepi32_pd( k47_2.v ); // p47_3.v = _mm256_cvtepi32_pd( k47_3.v ); // // // --------------------- // // x -= p * ln2 // // --------------------- // // c1 = ln2 // // if a < 0, a = ( k - 1 ) * ln2 // // else , a = k * ln2 // // if a < 0, x -= ( k - 1 ) * ln2 // // else , x -= k * ln2 // // // tmp.v = _mm256_broadcast_sd( &c1 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v = _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // tmp.v = _mm256_broadcast_sd( &c2 ); // a03_0.v = _mm256_mul_pd( tmp.v, p03_0.v ); // a03_1.v = _mm256_mul_pd( tmp.v, p03_1.v ); // a03_2.v = _mm256_mul_pd( tmp.v, p03_2.v ); // a03_3.v = _mm256_mul_pd( tmp.v, p03_3.v ); // a47_0.v = _mm256_mul_pd( tmp.v, p47_0.v ); // a47_1.v = _mm256_mul_pd( tmp.v, p47_1.v ); // a47_2.v = _mm256_mul_pd( tmp.v, p47_2.v ); // a47_3.v = _mm256_mul_pd( tmp.v, p47_3.v ); // c03_0.v = _mm256_sub_pd( c03_0.v, a03_0.v ); // c03_1.v 
= _mm256_sub_pd( c03_1.v, a03_1.v ); // c03_2.v = _mm256_sub_pd( c03_2.v, a03_2.v ); // c03_3.v = _mm256_sub_pd( c03_3.v, a03_3.v ); // c47_0.v = _mm256_sub_pd( c47_0.v, a47_0.v ); // c47_1.v = _mm256_sub_pd( c47_1.v, a47_1.v ); // c47_2.v = _mm256_sub_pd( c47_2.v, a47_2.v ); // c47_3.v = _mm256_sub_pd( c47_3.v, a47_3.v ); // // // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], c03_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); // //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); // // // // Prefetch u // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( u ) ); // // // // // Compute e^x using polynomial approximation // // a = w10 + w11 * x // tmp.v = _mm256_broadcast_sd( &w11 ); // //tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( c03_0.v, tmp.v ); // a03_1.v = _mm256_mul_pd( c03_1.v, tmp.v ); // a03_2.v = _mm256_mul_pd( c03_2.v, tmp.v ); // a03_3.v = _mm256_mul_pd( c03_3.v, tmp.v ); // a47_0.v = _mm256_mul_pd( c47_0.v, tmp.v ); // a47_1.v = _mm256_mul_pd( c47_1.v, tmp.v ); // a47_2.v = _mm256_mul_pd( c47_2.v, tmp.v ); // a47_3.v = _mm256_mul_pd( c47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w10 ); // //tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); 
// a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // a = w8 + ( w9 + ( w10 + w11 * x ) * x ) * x // tmp.v = _mm256_broadcast_sd( &w9 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w8 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w7 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( 
a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w6 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w5 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = 
_mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w4 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Prefetch w // __asm__ volatile( "prefetcht0 0(%0) \n\t" : :"r"( w ) ); // // Preload u03 // u03.v = _mm256_load_pd( (double*)u ); // // // tmp.v = _mm256_broadcast_sd( &w3 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w2 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // 
a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // tmp.v = _mm256_broadcast_sd( &w1 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // tmp.v = _mm256_broadcast_sd( &w0 ); // a03_0.v = _mm256_mul_pd( a03_0.v, c03_0.v ); // a03_1.v = _mm256_mul_pd( a03_1.v, c03_1.v ); // a03_2.v = _mm256_mul_pd( a03_2.v, c03_2.v ); // a03_3.v = _mm256_mul_pd( a03_3.v, c03_3.v ); // a47_0.v = _mm256_mul_pd( a47_0.v, c47_0.v ); // a47_1.v = _mm256_mul_pd( a47_1.v, c47_1.v ); // a47_2.v = _mm256_mul_pd( a47_2.v, c47_2.v ); // a47_3.v = _mm256_mul_pd( a47_3.v, c47_3.v ); // a03_0.v = _mm256_add_pd( a03_0.v, tmp.v ); // a03_1.v = _mm256_add_pd( a03_1.v, tmp.v ); // a03_2.v = _mm256_add_pd( 
a03_2.v, tmp.v ); // a03_3.v = _mm256_add_pd( a03_3.v, tmp.v ); // a47_0.v = _mm256_add_pd( a47_0.v, tmp.v ); // a47_1.v = _mm256_add_pd( a47_1.v, tmp.v ); // a47_2.v = _mm256_add_pd( a47_2.v, tmp.v ); // a47_3.v = _mm256_add_pd( a47_3.v, tmp.v ); // // // // Preload u47 // u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // offset.v = _mm_setr_epi32( 1023, 1023, 0, 0 ); // k1.v = _mm_set_epi32( 0, 0, k03_0.d[ 1 ], k03_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_0.d[ 3 ], k03_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_1.d[ 1 ], k03_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_1.d[ 3 ], k03_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_2.d[ 1 ], k03_2.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k03_2.d[ 3 ], k03_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_2.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k03_3.d[ 1 ], k03_3.d[ 0 ]); // 
k2.v = _mm_set_epi32( 0, 0, k03_3.d[ 3 ], k03_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p03_3.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_0.d[ 1 ], k47_0.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_0.d[ 3 ], k47_0.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_0.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_1.d[ 1 ], k47_1.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_1.d[ 3 ], k47_1.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_1.v = _mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_2.d[ 1 ], k47_2.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_2.d[ 3 ], k47_2.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_2.v = 
_mm256_set_m128d( p2, p1 ); // k1.v = _mm_set_epi32( 0, 0, k47_3.d[ 1 ], k47_3.d[ 0 ]); // k2.v = _mm_set_epi32( 0, 0, k47_3.d[ 3 ], k47_3.d[ 2 ]); // k1.v = _mm_add_epi32( k1.v, offset.v ); // k2.v = _mm_add_epi32( k2.v, offset.v ); // k1.v = _mm_slli_epi32( k1.v, 20 ); // k2.v = _mm_slli_epi32( k2.v, 20 ); // k1.v = _mm_shuffle_epi32( k1.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // k2.v = _mm_shuffle_epi32( k2.v, _MM_SHUFFLE( 1, 3, 0, 2 ) ); // p1 = _mm_castsi128_pd( k1.v ); // p2 = _mm_castsi128_pd( k2.v ); // p47_3.v = _mm256_set_m128d( p2, p1 ); // // // //u03.v = _mm256_load_pd( (double*)u ); // //u47.v = _mm256_load_pd( (double*)( u + 4 ) ); // // // c03_0.v = _mm256_mul_pd( a03_0.v, p03_0.v ); // c03_1.v = _mm256_mul_pd( a03_1.v, p03_1.v ); // c03_2.v = _mm256_mul_pd( a03_2.v, p03_2.v ); // c03_3.v = _mm256_mul_pd( a03_3.v, p03_3.v ); // c47_0.v = _mm256_mul_pd( a47_0.v, p47_0.v ); // c47_1.v = _mm256_mul_pd( a47_1.v, p47_1.v ); // c47_2.v = _mm256_mul_pd( a47_2.v, p47_2.v ); // c47_3.v = _mm256_mul_pd( a47_3.v, p47_3.v ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error exp Nan: c03_0[ %d ]\n", i ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error exp Nan: c03_1[ %d ]\n", i ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error exp Nan: c03_2[ %d ]\n", i ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error exp Nan: c03_3[ %d ]\n", i ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error exp Nan: c47_0[ %d ]\n", i ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error exp Nan: c47_1[ %d ]\n", i ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error exp Nan: c47_2[ %d ]\n", i ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error exp Nan: c47_3[ %d ]\n", i ); // } //} //printf( "exp\n" ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[0], c03_1.d[0], c03_2.d[0], c03_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[1], c03_1.d[1], c03_2.d[1], 
c03_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[2], c03_1.d[2], c03_2.d[2], c03_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c03_0.d[3], c03_1.d[3], c03_2.d[3], c03_3.d[3] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[0], c47_1.d[0], c47_2.d[0], c47_3.d[0] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[1], c47_1.d[1], c47_2.d[1], c47_3.d[1] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[2], c47_1.d[2], c47_2.d[2], c47_3.d[2] ); //printf( "%lf, %lf, %lf, %lf\n", c47_0.d[3], c47_1.d[3], c47_2.d[3], c47_3.d[3] ); //printf( "w\n" ); //printf( "%lf, %lf, %lf, %lf\n", w[0], w[3], w[3], w[3] ); //u03.v = _mm256_load_pd( (double*)u ); //u47.v = _mm256_load_pd( (double*)( u + 4 ) ); w_tmp.v = _mm256_broadcast_sd( (double*)w ); c03_0.v = _mm256_mul_pd( w_tmp.v, c03_0.v ); c47_0.v = _mm256_mul_pd( w_tmp.v, c47_0.v ); u03.v = _mm256_add_pd( u03.v, c03_0.v ); u47.v = _mm256_add_pd( u47.v, c47_0.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 1 ) ); c03_1.v = _mm256_mul_pd( w_tmp.v, c03_1.v ); c47_1.v = _mm256_mul_pd( w_tmp.v, c47_1.v ); u03.v = _mm256_add_pd( u03.v, c03_1.v ); u47.v = _mm256_add_pd( u47.v, c47_1.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 2 ) ); c03_2.v = _mm256_mul_pd( w_tmp.v, c03_2.v ); c47_2.v = _mm256_mul_pd( w_tmp.v, c47_2.v ); u03.v = _mm256_add_pd( u03.v, c03_2.v ); u47.v = _mm256_add_pd( u47.v, c47_2.v ); //for ( i = 0; i < 4; i++ ) { // if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} w_tmp.v = _mm256_broadcast_sd( (double*)( w + 3 ) ); c03_3.v = _mm256_mul_pd( w_tmp.v, c03_3.v ); c47_3.v = _mm256_mul_pd( w_tmp.v, c47_3.v ); u03.v = _mm256_add_pd( u03.v, c03_3.v ); u47.v = _mm256_add_pd( u47.v, c47_3.v ); //for ( i = 0; i < 4; i++ ) { // 
if ( w_tmp.d[ i ] != w_tmp.d[ i ] ) { // printf( "error w_tmp Nan: w_tmp[ %d ]\n", i ); // } //} _mm256_store_pd( (double*)u, u03.v ); _mm256_store_pd( (double*)( u + 4 ), u47.v ); //for ( i = 0; i < 4; i++ ) { // if ( c03_0.d[ i ] != c03_0.d[ i ] ) { // printf( "error gemv Nan: c03_0[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_1.d[ i ] != c03_1.d[ i ] ) { // printf( "error gemv Nan: c03_1[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_2.d[ i ] != c03_2.d[ i ] ) { // printf( "error gemv Nan: c03_2[ %d ]\n", i ); // exit( 1 ); // } // if ( c03_3.d[ i ] != c03_3.d[ i ] ) { // printf( "error gemv Nan: c03_3[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_0.d[ i ] != c47_0.d[ i ] ) { // printf( "error gemv Nan: c47_0[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_1.d[ i ] != c47_1.d[ i ] ) { // printf( "error gemv Nan: c47_1[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_2.d[ i ] != c47_2.d[ i ] ) { // printf( "error gemv Nan: c47_2[ %d ]\n", i ); // exit( 1 ); // } // if ( c47_3.d[ i ] != c47_3.d[ i ] ) { // printf( "error gemv Nan: c47_3[ %d ]\n", i ); // exit( 1 ); // } //} //for ( i = 0; i < 4; i ++ ) { // if ( w[ i ] != w[ i ] ) { // printf( "GSKS error w Nan: w03[ %d ]\n", i ); // } //} //for ( i = 0; i < 4; i++ ) { // if ( u03.d[ i ] != u03.d[ i ] ) { // printf( "GSKS error u Nan: u03[ %d ]\n", i ); // } // if ( u47.d[ i ] != u47.d[ i ] ) { // printf( "GSKS error u Nan: u47[ %d ]\n", i ); // } //} //printf( "%lf\n", u03.d[0] ); //printf( "%lf\n", u03.d[1] ); //printf( "%lf\n", u03.d[2] ); //printf( "%lf\n", u03.d[3] ); //printf( "%lf\n", u47.d[0] ); //printf( "%lf\n", u47.d[1] ); //printf( "%lf\n", u47.d[2] ); //printf( "%lf\n", u47.d[3] ); }