// Interleave/transpose two 8-float vectors v1[0..1] into vout[0..1].
// Tracing the AVX2 permute and blend masks gives, with s = v1[0], t = v1[1]:
//   vout[0] = { s0, s4, t0, t4, s1, s5, t1, t5 }
//   vout[1] = { s2, s6, t2, t6, s3, s7, t3, t7 }
// NOTE(review): the "#if 0" AVX1 path is compiled out; confirm it produces the
// same element order as the AVX2 path before re-enabling it.
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout ) {
#if 0 // AVX1
	__m256 a0 = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
	__m256 a1 = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );
	__m256 b0 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
	__m256 b1 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
	__m256 c0 = _mm256_unpacklo_ps( b0, b1 );
	__m256 c1 = _mm256_unpackhi_ps( b0, b1 );
	vout[ 0 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
	vout[ 1 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else // AVX2
	// Cross-lane source indices for each input (32-byte aligned for the load).
	static const int ALIGN32 p1[ 8 ] = { 0, 4, 2, 6, 1, 5, 3, 7 };
	static const int ALIGN32 p2[ 8 ] = { 2, 6, 0, 4, 3, 7, 1, 5 };
	const __m256i perm1 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p1 ) );
	const __m256i perm2 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p2 ) );
	__m256 a0 = _mm256_permutevar8x32_ps( v1[ 0 ], perm1 );
	__m256 a1 = _mm256_permutevar8x32_ps( v1[ 1 ], perm2 );
	// 0xCC takes elements 2,3,6,7 from a1; 0x4E swaps the 64-bit halves of each lane.
	vout[ 0 ] = _mm256_blend_ps( a0, a1, 0xCC );
	vout[ 1 ] = _mm256_shuffle_ps( a0, a1, 0x4E );
#endif
}
/* Compile-time diagnostics test: every call below passes a non-immediate
   variable (k4) where the intrinsic requires an 8-bit immediate operand.
   Each "dg-error" directive asserts that the compiler rejects the call on
   that line with the expected message.  This code is never executed and is
   *meant* not to compile — do not "fix" the calls or move the directives to
   a different line. */
void test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
}
// Computes y = A*x (alg==0), y += A*x (alg==1) or y -= A*x (alg==-1) for one
// 8-row panel of an upper-triangular matrix stored column by column with
// panel height 8 (lda == 8, see the &A[0+lda*j] addressing).  The first 8
// columns are masked with growing blends (0x01, 0x03, ..., 0x7f) so that only
// the upper-triangular part of the leading 8x8 block contributes; the
// remaining kmax-8 columns are full.
// NOTE(review): kmax >= 8 is assumed — the two unrolled 4-column groups before
// the main loop run unconditionally and k starts at 8.  Confirm callers never
// pass 0 < kmax < 8.
void kernel_strmv_u_n_8_lib8(int kmax, float *A, float *x, float *y, int alg)
	{

	if(kmax<=0)
		return;

	const int lda = 8;

	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int k;

	__m256
		zeros, ax_temp,
		a_00, a_01, a_02, a_03,
		x_0, x_1, x_2, x_3,
		y_0, y_0_b, y_0_c, y_0_d, z_0;

	zeros = _mm256_setzero_ps();

	// Four independent accumulators hide the FP add latency; folded at the end.
	y_0 = _mm256_setzero_ps();
	y_0_b = _mm256_setzero_ps();
	y_0_c = _mm256_setzero_ps();
	y_0_d = _mm256_setzero_ps();

	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	// Columns 0..3 of the triangular block: x[j] is broadcast and masked so
	// only rows 0..j contribute.
	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0 = _mm256_broadcast_ss( &x[0] );
	x_0 = _mm256_blend_ps( zeros, x_0, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1 = _mm256_broadcast_ss( &x[1] );
	x_1 = _mm256_blend_ps( zeros, x_1, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2 = _mm256_broadcast_ss( &x[2] );
	x_2 = _mm256_blend_ps( zeros, x_2, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3 = _mm256_broadcast_ss( &x[3] );
	x_3 = _mm256_blend_ps( zeros, x_3, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );

	A += 4*lda;
	x += 4;

	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	// Columns 4..7: masks keep growing (0x1f..0x7f); column 7 needs no mask.
	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0 = _mm256_broadcast_ss( &x[0] );
	x_0 = _mm256_blend_ps( zeros, x_0, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1 = _mm256_broadcast_ss( &x[1] );
	x_1 = _mm256_blend_ps( zeros, x_1, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2 = _mm256_broadcast_ss( &x[2] );
	x_2 = _mm256_blend_ps( zeros, x_2, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3 = _mm256_broadcast_ss( &x[3] );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );

	A += 4*lda;
	x += 4;

	// Rectangular part above the diagonal block: full (unmasked) columns,
	// unrolled 8 columns per iteration.
	k=8;
	for(; k<kmax-7; k+=8)
		{

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2 = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3 = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2 = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3 = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		}
	// 4-column tail of the unrolled loop.
	for(; k<kmax-3; k+=4)
		{

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2 = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3 = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		}

	// Fold the four accumulators pairwise.
	y_0 = _mm256_add_ps( y_0 , y_0_c );
	y_0_b = _mm256_add_ps( y_0_b, y_0_d );

	// Remaining 2-column tail.
	if(kmax%4>=2)
		{

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1 = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		A += 2*lda;
		x += 2;

		}

	y_0 = _mm256_add_ps( y_0 , y_0_b );

	// Remaining single column.
	if(kmax%2==1)
		{

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0 = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

/*		A += 1*lda;*/
/*		x += 1;*/

		}

	// Write back according to alg: store, accumulate, or subtract.
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_add_ps( z_0, y_0 );
		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_sub_ps( z_0, y_0 );
		_mm256_storeu_ps(&y[0], z_0);
		}

	}
// Computes y = A'*x (alg==0), y += A'*x (alg==1) or y -= A'*x (alg==-1) for
// an 8-column block of an upper-triangular matrix packed in row panels of
// height 8 (panel stride sda*lda).  The part above the diagonal block is
// accumulated in the main loop with plain dot products; the final 8x8
// diagonal block is masked column by column (blends 0x01..0x7f) so that only
// its upper-triangular part contributes.
// NOTE(review): the kmax<=0 early-out is commented out and the diagonal block
// is processed unconditionally after floor((kmax-?)/8) full panels — the
// caller apparently guarantees a compatible kmax; confirm the contract.
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/

	const int lda = 8;
/*	const int bs = 8;*/

	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	int k;

	__m256
		zeros, ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;

	zeros = _mm256_setzero_ps();

	// One accumulator per output column; reduced horizontally at the end.
	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();

	k=0;
	// Full 8-row panels strictly above the diagonal block.
	for(; k<kmax-7; k+=8)
		{

		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );

		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*4] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_4 = _mm256_add_ps( y_4, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*5] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_5 = _mm256_add_ps( y_5, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*6] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_6 = _mm256_add_ps( y_6, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*7] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_7 = _mm256_add_ps( y_7, ax_temp );

		A += sda*lda;
		x += lda;

		}

	// Diagonal 8x8 block: column j keeps only its first j+1 rows
	// (blend mask 2^(j+1)-1); column 7 needs no mask.
	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );
	a_00 = _mm256_load_ps( &A[0+lda*4] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_4 = _mm256_add_ps( y_4, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*5] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_5 = _mm256_add_ps( y_5, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*6] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_6 = _mm256_add_ps( y_6, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*7] );
/*	a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_7 = _mm256_add_ps( y_7, ax_temp );

	// reduction: hadd tree collapses the eight per-column accumulators into
	// one vector holding the eight dot products.
	__m256 z_0;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);
	y_4 = _mm256_hadd_ps(y_4, y_5);
	y_6 = _mm256_hadd_ps(y_6, y_7);

	y_0 = _mm256_hadd_ps(y_0, y_2);
	y_4 = _mm256_hadd_ps(y_4, y_6);

	y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
	y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);

	y_0 = _mm256_add_ps(y_1, y_2);

	// store: overwrite, accumulate, or subtract depending on alg.
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_add_ps(z_0, y_0);
		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );
		z_0 = _mm256_sub_ps(z_0, y_0);
		_mm256_storeu_ps(&y[0], z_0);
		}

	}
// In-place ascending sort of 8 floats using a bitonic sorting network
// (six compare-exchange stages, listed in the commented table below).
// NOTE(review): `input` must be 32-byte aligned — _mm256_load_ps /
// _mm256_store_ps are used; confirm all callers guarantee the alignment.
M_ALWAYS_INLINE static void bar(float (& input)[8])
{
/*
	static constexpr uint_fast8_t idx[][2] = {
		{0, 1}, {3, 2}, {4, 5}, {7, 6}, // (1)
		{0, 2}, {1, 3}, {6, 4}, {7, 5}, // (2)
		{0, 1}, {2, 3}, {5, 4}, {7, 6}, // (3)
		{0, 4}, {1, 5}, {2, 6}, {3, 7}, // (4)
		{0, 2}, {1, 3}, {4, 6}, {5, 7}, // (5)
		{0, 1}, {2, 3}, {4, 5}, {6, 7}  // (6)
	};
*/
	// The index pairs have to be expressed in a form convenient for the AVX
	// instructions.  A loop that is later unrolled makes no sense here,
	// because stage (4) is special... better to write it out by hand.
	// Each blend mask selects which positions receive the min of a pair.
	static constexpr int blend_mask_1 =0b10011001;
	static constexpr int blend_mask_2=0b11000011;
	static constexpr int blend_mask_3 =0b10100101;
	static constexpr int blend_mask_4 =0b00001111;
	static constexpr int blend_mask_5=0b00110011;
	static constexpr int blend_mask_6=0b01010101;
	// Used by stages (1), (3) and (6): swap adjacent elements in each lane.
	static constexpr int permute_mask_1=0b10110001;
	// Used by stages (2) and (5): swap element pairs in each lane.
	static constexpr int permute_mask_2=0b01001110;
	__m256 result= _mm256_load_ps(input);
	// (1)
	__m256 mapped=_mm256_permute_ps(result,permute_mask_1);
	__m256 min=_mm256_min_ps(result,mapped);
	__m256 max=_mm256_max_ps(result,mapped);
	result=_mm256_blend_ps(max,min,blend_mask_1);
	// (2)
	mapped=_mm256_permute_ps(result,permute_mask_2);
	min=_mm256_min_ps(result,mapped);
	max=_mm256_max_ps(result,mapped);
	result=_mm256_blend_ps(max,min,blend_mask_2);
	// (3)
	mapped=_mm256_permute_ps(result,permute_mask_1);
	min=_mm256_min_ps(result,mapped);
	max=_mm256_max_ps(result,mapped);
	result=_mm256_blend_ps(max,min,blend_mask_3);
	// (4) The special case: here we have to permute between the two 128-bit
	// halves of the YMM register, which in-lane _mm256_permute_ps cannot do.
	mapped=_mm256_permute2f128_ps(result,result,1);
	min=_mm256_min_ps(result,mapped);
	max=_mm256_max_ps(result,mapped);
	result=_mm256_blend_ps(max,min,blend_mask_4);
	// (5)
	mapped=_mm256_permute_ps(result,permute_mask_2);
	min=_mm256_min_ps(result,mapped);
	max=_mm256_max_ps(result,mapped);
	result=_mm256_blend_ps(max,min,blend_mask_5);
	// (6)
	mapped=_mm256_permute_ps(result,permute_mask_1);
	min=_mm256_min_ps(result,mapped);
	max=_mm256_max_ps(result,mapped);
	result=_mm256_blend_ps(max,min,blend_mask_6);
	_mm256_store_ps(input,result);
}
inline __m256 _mm256_broadcast_3_ss(__m256 a) { __m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3)); return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0); }
inline __m256 _mm256_broadcast_lo_ss(__m256 a) { __m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(0, 0, 0, 0)); \ return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0); \ }
// Symmetric matrix-vector kernel over 4 columns of a panel-packed matrix
// (panel height lda == 8, panel stride sda*lda).  In one sweep it updates
//   y_n += A  * x_n  ("n" direction: columns scaled by broadcast x_n[0..3])
//   y_t += A' * x_t  ("t" direction: dot products of the columns with x_t)
// alg==1 adds and alg==-1 subtracts; x_n is negated up front so the y_n
// update is always an add, and the sign is applied to y_t at the very end.
// kna is the number of rows left before panel alignment.  tri==1 handles a
// triangular corner first: for column j the "n" update keeps the diagonal and
// below (blend masks 0x00,0x01,0x03,0x07 zero the rows above), while the "t"
// update also zeroes the diagonal (masks 0x01..0x0f), so each symmetric
// off-diagonal entry is counted exactly once in each direction.
// Lane-masking trick: blendv zeroes lane i whenever mask_f[i] = 7.5-i is
// below k_left_d = 8-k_left, i.e. for all i >= k_left, so only the first
// k_left lanes of a partial panel contribute.
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{

	if(kmax<=0)
		return;

	const int lda = 8;

	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int k, k_left, ii; // ii and temp_space are retained for debug scaffolding
	float k_left_d;
	// Per-lane thresholds for building a "first k_left lanes" mask via blendv.
	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
	float temp_space[8] = {};

	__m256
		mask, zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;

	mask = _mm256_loadu_ps( mask_f );
	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		// Negate the broadcast coefficients so the y_n update stays an add.
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();

	k=0;

	// corner: triangular head of the first (unaligned) panel.
	if(tri==1)
		{

		k_left = kna-k;
		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		// zero lanes i >= k_left of x_t
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		a_00 = _mm256_loadu_ps( &A[0+lda*0] );
		a_00 = _mm256_blend_ps( a_00, zeros, 0x00 ); // no-op, kept for pattern symmetry
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00 = _mm256_blend_ps( a_00, zeros, 0x01 ); // drop the diagonal for the "t" update
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		a_01 = _mm256_loadu_ps( &A[0+lda*1] );
		a_01 = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01 = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		a_02 = _mm256_loadu_ps( &A[0+lda*2] );
		a_02 = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02 = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		a_03 = _mm256_loadu_ps( &A[0+lda*3] );
		a_03 = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03 = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		// keep only the first k_left lanes of the y_n update
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );

		A += k_left;
		y_n += k_left;
		x_t += k_left;
		k += k_left;

		}
	// rectangular head up to panel alignment (unaligned loads)
	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{

		k_left = kna-k;
		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		a_00 = _mm256_loadu_ps( &A[0+lda*0] );
		a_01 = _mm256_loadu_ps( &A[0+lda*1] );
		a_02 = _mm256_loadu_ps( &A[0+lda*2] );
		a_03 = _mm256_loadu_ps( &A[0+lda*3] );

		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );

		A += k_left;
		y_n += k_left;
		x_t += k_left;
		k += k_left;

		}
	if(kna>0 || tri==1)
		{
		// skip to the start of the next row panel
		A += (sda-1)*lda;
		}
	// aligned full panels of 8 rows
	for(; k<kmax-7; k+=8)
		{

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		temp = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		_mm256_storeu_ps( &y_n[0], y_n_0 );

		A += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	// masked tail panel
	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{

		k_left = kmax-k;
		k_left_d = 8.0 - k_left;

		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		a_03 = _mm256_load_ps( &A[0+lda*3] );

		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );

		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );

/*		A += 1;*/
/*		y_n += 1;*/
/*		x_t += 1;*/

		}

	// reduction: collapse the four y_t accumulators into four dot products.
	__m128 z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);

	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);

	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );
		z_0 = _mm_add_ps(z_0, z_1);
		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );
		z_0 = _mm_sub_ps(z_0, z_1);
		_mm_storeu_ps( &y_t[0], z_0 );
		}

	}