// Codegen test wrapper: returns the result of _mm_broadcast_ss applied to the
// float that __a points to.
// The directive comments inside the body appear to be LLVM FileCheck patterns
// matched against the compiler's output for this function; they expect, in
// order, four insertelement operations on a <4 x float> value, one for each
// lane index 0 through 3. Keep the directives and their order unchanged.
__m128 test_mm_broadcast_ss(float const *__a) {
  // CHECK-LABEL: @test_mm_broadcast_ss
  // CHECK: insertelement <4 x float> {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, i32 3
  return _mm_broadcast_ss(__a);
}
/* Runtime check for _mm_broadcast_ss: broadcasts one float into a 128-bit
   vector and compares it against an array holding the same value four times.
   Calls abort () when check_union128 returns non-zero (presumably meaning
   the lanes differ -- check_union128 is defined elsewhere).  */
static void
avx_test (void)
{
  float value = 39678.3452;
  float expected[4];
  union128 actual;
  int lane;

  /* Reference result: every lane holds the scalar.  */
  for (lane = 0; lane < 4; lane++)
    {
      expected[lane] = value;
    }

  actual.x = _mm_broadcast_ss (&value);

  if (check_union128 (actual, expected))
    {
      abort ();
    }
}
float tricub_x86_f(float *src, float *abcd, float x, float y){ float *s; float x0, x1, x2, x3, y0, y1, y2, y3; float dst[4]; #if defined(__AVX2__) && defined(__x86_64__) __m256 v1, v2, v3, v4; __m256 va, vb, vc, vd; __m128 va4, vb4, vc4, vd4; __m128 v128a, v128b; __m128 vy0, vy1, vy2, vy3; #else int i, ni2, ni3, ninj2, ninj3; float va4[4], vb4[4], vc4[4], vd4[4]; ninj2 = ninj + ninj; ninj3 = ninj2 + ninj; ni2 = ni + ni; ni3 = ni2 + ni; #endif #if defined(__AVX2__) && defined(__x86_64__) // ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ==== va = _mm256_broadcast_ss(abcd); // promote constants to vectors vb = _mm256_broadcast_ss(abcd+1); vc = _mm256_broadcast_ss(abcd+2); vd = _mm256_broadcast_ss(abcd+3); s = src; // rows 0 and 1, 4 planes (Z0, Z1, Z2, Z3) v128a = _mm_loadu_ps(s); // Z0 row 0 v1 = _mm256_insertf128_ps(v1,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z0 row 1 v1 = _mm256_insertf128_ps(v1,v128b,1); v1 = _mm256_mul_ps(v1,va); // v1 = v1*va s += ninj; v128a = _mm_loadu_ps(s); // Z1 row 0 v2 = _mm256_insertf128_ps(v2,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z1 row 1 v2 = _mm256_insertf128_ps(v2,v128b,1); v1 = _mm256_fmadd_ps(v2,vb,v1); // v1 += v2*vb s += ninj; v128a = _mm_loadu_ps(s); // Z2 row 0 v3 = _mm256_insertf128_ps(v3,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z2 row 1 v3 = _mm256_insertf128_ps(v3,v128b,1); v1 = _mm256_fmadd_ps(v3,vc,v1); // v1 += v3*vc s += ninj; v128a = _mm_loadu_ps(s); // Z3 row 0 v4 = _mm256_insertf128_ps(v4,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z3 row 1 v4 = _mm256_insertf128_ps(v4,v128b,1); v1 = _mm256_fmadd_ps(v4,vd,v1); // v1 += v4*vd // split vector of length 8 into 2 vectors of length 4 vy0 = _mm256_extractf128_ps(v1,0);// Y0 : row 0 (v1 low) vy1 = _mm256_extractf128_ps(v1,1);// Y1 : row 1 (v1 high) s = src + 2*ni; // rows 2 and 3, 4 planes (Z0, Z1, Z2, Z3) v128a = _mm_loadu_ps(s); // Z0 row 2 v1 = _mm256_insertf128_ps(v1,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z0 row 3 v1 = 
_mm256_insertf128_ps(v1,v128b,1); v1 = _mm256_mul_ps(v1,va); // v1 = v1*va s += ninj; v128a = _mm_loadu_ps(s); // Z1 row 2 v2 = _mm256_insertf128_ps(v2,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z1 row 3 v2 = _mm256_insertf128_ps(v2,v128b,1); v1 = _mm256_fmadd_ps(v2,vb,v1); // v1 += v2*vb s += ninj; v128a = _mm_loadu_ps(s); // Z2 row 2 v3 = _mm256_insertf128_ps(v3,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z2 row 3 v3 = _mm256_insertf128_ps(v3,v128b,1); v1 = _mm256_fmadd_ps(v3,vc,v1); // v1 += v3*vc s += ninj; v128a = _mm_loadu_ps(s); // Z3 row 2 v4 = _mm256_insertf128_ps(v4,v128a,0); v128b = _mm_loadu_ps(s+ni); // Z3 row 3 v4 = _mm256_insertf128_ps(v4,v128b,1); v1 = _mm256_fmadd_ps(v4,vd,v1); // v1 += v4*vd // split vector of length 8 into 2 vectors of length 4 vy2 = _mm256_extractf128_ps(v1,0);// Y2 : row 2 (v1 low) vy3 = _mm256_extractf128_ps(v1,1);// Y3 : row 3 (v1 high) // ==== interpolation along Y, vector length is 4 (4 rows) ==== y0 = cm167*y*(y-one)*(y-two); y1 = cp5*(y+one)*(y-one)*(y-two); y2 = cm5*y*(y+one)*(y-two); y3 = cp167*y*(y+one)*(y-one); va4 = _mm_broadcast_ss(&y0); // promote constants to vectors vb4 = _mm_broadcast_ss(&y1); vc4 = _mm_broadcast_ss(&y2); vd4 = _mm_broadcast_ss(&y3); vy0 = _mm_mul_ps(vy0,va4); // vy0 * va4 vy0 = _mm_fmadd_ps(vy1,vb4,vy0); // += vy1 * vb4 vy0 = _mm_fmadd_ps(vy2,vc4,vy0); // += vy2 * vc4 vy0 = _mm_fmadd_ps(vy3,vd4,vy0); // += vy3 * vd4 _mm_storeu_ps(dst,vy0); // store 4 values along X #else y0 = cm167*y*(y-one)*(y-two); y1 = cp5*(y+one)*(y-one)*(y-two); y2 = cm5*y*(y+one)*(y-two); y3 = cp167*y*(y+one)*(y-one); for (i=0 ; i<4 ; i++){ va4[i] = src[i ]*abcd[0] + src[i +ninj]*abcd[1] + src[i +ninj2]*abcd[2] + src[i +ninj3]*abcd[3]; vb4[i] = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] + src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3]; vc4[i] = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] + src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3]; vd4[i] = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] + 
src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3]; dst[i] = va4[i]*y0 + vb4[i]*y1 + vc4[i]*y2 + vd4[i]*y3; } #endif // ==== interpolation along x, scalar ==== x0 = cm167*x*(x-one)*(x-two); x1 = cp5*(x+one)*(x-one)*(x-two); x2 = cm5*x*(x+one)*(x-two); x3 = cp167*x*(x+one)*(x-one); return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3); }
/*
 * Single-precision kernel working on 4 columns of a matrix stored in panels
 * of 8 rows (element (row, col) of the current panel is A[row + 8*col]).
 *
 * For every processed row i and column j = 0..3 it accumulates
 *     y_n[i] += A(i,j) * x_n[j]      (x_n is negated first when alg == -1)
 *     acc[j] += A(i,j) * x_t[i]
 * and finally updates y_t[0..3] with acc: added when alg == 1, subtracted
 * for any other value of alg.
 *
 * kmax : total number of rows; nothing is done when kmax <= 0.
 * kna  : rows (counted from row 0, including the corner when tri == 1) that
 *        are handled one at a time before the 8-row panel loop starts.
 * A    : matrix data; advanced by sda*8 floats per full panel. The panel
 *        loop uses aligned 128-bit loads, so A must be 16-byte aligned there.
 * sda  : panel stride, in units of 8 floats.
 * x_n  : 4 input values, one per column.
 * y_n  : kmax values, updated in place.
 * x_t  : kmax input values.
 * y_t  : 4 values, updated in place.
 * tri  : when 1, the first 4 rows are treated as a triangular corner: only
 *        elements with col >= row are read, and the diagonal element
 *        contributes to acc[] but not to y_n.
 * alg  : 1 or -1, see above.
 */
void kernel_ssymv_4_lib8_old(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{

	if(kmax<=0)
		return;

	const int lda = 8;

	int k, row, col, off;

	__m128 xn[4];	/* x_n[col] broadcast to all lanes */
	__m128 yt[4];	/* per-column accumulators for the y_t update */
	__m128 a_el, yn_acc, xt_el, yt_old;

	const __m128 zero = _mm_setzero_ps();

	for(col=0; col<4; col++)
		{
		xn[col] = _mm_broadcast_ss( &x_n[col] );
		if(alg==-1)
			xn[col] = _mm_sub_ps( zero, xn[col] );
		yt[col] = _mm_setzero_ps();
		}

	k = 0;

	/* triangular corner: 4 rows, only columns col >= row */
	if(tri==1)
		{
		for(row=0; row<4; row++)
			{
			yn_acc = _mm_load_ss( &y_n[row] );
			xt_el  = _mm_load_ss( &x_t[row] );

			for(col=row; col<4; col++)
				{
				a_el = _mm_load_ss( &A[row+lda*col] );
				/* the diagonal element is applied through yt[] only */
				if(col>row)
					yn_acc = _mm_add_ss( yn_acc, _mm_mul_ss( a_el, xn[col] ) );
				yt[col] = _mm_add_ss( yt[col], _mm_mul_ss( a_el, xt_el ) );
				}

			_mm_store_ss( &y_n[row], yn_acc );
			}

		A   += 4;
		y_n += 4;
		x_t += 4;
		k   += 4;
		}

	/* single rows up to kna */
	for(; k<kna; k++)
		{
		yn_acc = _mm_load_ss( &y_n[0] );
		xt_el  = _mm_load_ss( &x_t[0] );

		for(col=0; col<4; col++)
			{
			a_el = _mm_load_ss( &A[0+lda*col] );
			yn_acc  = _mm_add_ss( yn_acc, _mm_mul_ss( a_el, xn[col] ) );
			yt[col] = _mm_add_ss( yt[col], _mm_mul_ss( a_el, xt_el ) );
			}

		_mm_store_ss( &y_n[0], yn_acc );

		A   += 1;
		y_n += 1;
		x_t += 1;
		}

	/* jump from the rows consumed above to the start of the next panel */
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}

	/* full panels: 8 rows per iteration, as two groups of 4 rows */
	for(; k<kmax-7; k+=8)
		{
		for(off=0; off<8; off+=4)
			{
			yn_acc = _mm_loadu_ps( &y_n[off] );
			xt_el  = _mm_loadu_ps( &x_t[off] );

			for(col=0; col<4; col++)
				{
				a_el = _mm_load_ps( &A[off+lda*col] );
				yn_acc  = _mm_add_ps( yn_acc, _mm_mul_ps( a_el, xn[col] ) );
				yt[col] = _mm_add_ps( yt[col], _mm_mul_ps( a_el, xt_el ) );
				}

			_mm_storeu_ps( &y_n[off], yn_acc );
			}

		A   += sda*lda;
		y_n += 8;
		x_t += 8;
		}

	/* remaining rows, one at a time */
	for(; k<kmax; k++)
		{
		yn_acc = _mm_load_ss( &y_n[0] );
		xt_el  = _mm_load_ss( &x_t[0] );

		for(col=0; col<4; col++)
			{
			a_el = _mm_load_ss( &A[0+lda*col] );
			yn_acc  = _mm_add_ss( yn_acc, _mm_mul_ss( a_el, xn[col] ) );
			yt[col] = _mm_add_ss( yt[col], _mm_mul_ss( a_el, xt_el ) );
			}

		_mm_store_ss( &y_n[0], yn_acc );

		A   += 1;
		y_n += 1;
		x_t += 1;
		}

	/* reduction: after the three horizontal adds, lane j of yt[0] holds the
	   sum of all lanes of the original yt[j] */
	yt[0] = _mm_hadd_ps( yt[0], yt[1] );
	yt[2] = _mm_hadd_ps( yt[2], yt[3] );
	yt[0] = _mm_hadd_ps( yt[0], yt[2] );

	yt_old = _mm_loadu_ps( &y_t[0] );
	if(alg==1)
		yt_old = _mm_add_ps( yt_old, yt[0] );
	else // alg==-1
		yt_old = _mm_sub_ps( yt_old, yt[0] );
	_mm_storeu_ps( &y_t[0], yt_old );

	}