Exemplo n.º 1
0
// Codegen test for _mm_broadcast_ss: wraps a single call to the intrinsic so
// the verification directives inside the body can match the emitted IR.
// The directives expect four insertelement instructions on a <4 x float>,
// one for each lane index 0 through 3.
__m128
test_mm_broadcast_ss(float const *__a) {
  // CHECK-LABEL: @test_mm_broadcast_ss
  // CHECK: insertelement <4 x float> {{.*}}, i32 0
  // CHECK: insertelement <4 x float> {{.*}}, i32 1
  // CHECK: insertelement <4 x float> {{.*}}, i32 2
  // CHECK: insertelement <4 x float> {{.*}}, i32 3
  return _mm_broadcast_ss(__a);
}
Exemplo n.º 2
0
/* Runtime check for _mm_broadcast_ss: broadcasts one scalar, builds a
   reference array holding four copies of that scalar, and aborts when
   check_union128 reports a mismatch between the two.  */
static void
avx_test (void)
{
  float value = 39678.3452;
  float expected[4];
  union128 result;
  int lane = 0;

  /* Reference result: the scalar repeated in all four positions.  */
  while (lane < 4)
    {
      expected[lane] = value;
      lane++;
    }

  /* Value under test.  */
  result.x = _mm_broadcast_ss (&value);

  if (check_union128 (result, expected) != 0)
    abort ();
}
Exemplo n.º 3
0
#if defined(__AVX2__) && defined(__x86_64__)
// Z interpolation for two consecutive rows (4 values along X each).
// s points at the first row in plane Z0; the second row is ni floats further,
// and successive planes are ninj floats apart.
// Returns an 8-wide vector: low half = first row, high half = second row,
// each being Z0*va + Z1*vb + Z2*vc + Z3*vd.
// Every 8-wide vector is built from a cast of the low half followed by an
// insert of the high half, so no uninitialized __m256 value is ever read.
static inline __m256 tricub_z_2rows_f(const float *s, __m256 va, __m256 vb, __m256 vc, __m256 vd){
  __m256 acc, v;

  v   = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(s)), _mm_loadu_ps(s+ni), 1);
  acc = _mm256_mul_ps(v,va);            // acc  = Z0 * va

  s  += ninj;
  v   = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(s)), _mm_loadu_ps(s+ni), 1);
  acc = _mm256_fmadd_ps(v,vb,acc);      // acc += Z1 * vb

  s  += ninj;
  v   = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(s)), _mm_loadu_ps(s+ni), 1);
  acc = _mm256_fmadd_ps(v,vc,acc);      // acc += Z2 * vc

  s  += ninj;
  v   = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_loadu_ps(s)), _mm_loadu_ps(s+ni), 1);
  acc = _mm256_fmadd_ps(v,vd,acc);      // acc += Z3 * vd

  return acc;
}
#endif

// Interpolate one value from a 4 x 4 x 4 neighbourhood of src.
//
//   src  : first element of the neighbourhood; rows are ni floats apart and
//          planes are ninj floats apart (ni and ninj come from file scope)
//   abcd : 4 weights applied to planes Z0..Z3
//   x, y : positions used to build the 4 weights along X and along Y from the
//          file-scope constants cm167, cp5, cm5, cp167, one and two
//          (presumably cubic Lagrange coefficients -- confirm at their definition)
//
// Returns the weighted sum: first along Z (abcd), then along Y, then along X.
//
// NOTE(review): the vector path is selected on __AVX2__ only but also uses FMA
// intrinsics; confirm the build enables FMA whenever AVX2 is enabled.
float tricub_x86_f(float *src, float *abcd, float x, float y){
  float x0, x1, x2, x3, y0, y1, y2, y3;
  float dst[4];
#if defined(__AVX2__) && defined(__x86_64__)
  __m256 va, vb, vc, vd;
  __m256 v01, v23;
  __m128 va4, vb4, vc4, vd4;
  __m128 vy0, vy1, vy2, vy3;
#else
  int i, ni2, ni3, ninj2, ninj3;
  float z0, z1, z2, z3;
#endif

// ==== weights along Y (identical for both code paths) ====

  y0 = cm167*y*(y-one)*(y-two);
  y1 = cp5*(y+one)*(y-one)*(y-two);
  y2 = cm5*y*(y+one)*(y-two);
  y3 = cp167*y*(y+one)*(y-one);

#if defined(__AVX2__) && defined(__x86_64__)

// ==== interpolation along Z, vector length is 16 (2 vectors of length 8 per plane) ====

  va = _mm256_broadcast_ss(abcd);   // promote constants to vectors
  vb = _mm256_broadcast_ss(abcd+1);
  vc = _mm256_broadcast_ss(abcd+2);
  vd = _mm256_broadcast_ss(abcd+3);

  v01 = tricub_z_2rows_f(src,        va, vb, vc, vd);   // rows 0 and 1
  v23 = tricub_z_2rows_f(src + 2*ni, va, vb, vc, vd);   // rows 2 and 3

                                    // split vectors of length 8 into 2 vectors of length 4
  vy0 = _mm256_extractf128_ps(v01,0);// Y0 : row 0 (low)
  vy1 = _mm256_extractf128_ps(v01,1);// Y1 : row 1 (high)
  vy2 = _mm256_extractf128_ps(v23,0);// Y2 : row 2 (low)
  vy3 = _mm256_extractf128_ps(v23,1);// Y3 : row 3 (high)

// ==== interpolation along Y, vector length is 4 (4 rows) ====

  va4 = _mm_broadcast_ss(&y0);      // promote constants to vectors
  vb4 = _mm_broadcast_ss(&y1);
  vc4 = _mm_broadcast_ss(&y2);
  vd4 = _mm_broadcast_ss(&y3);

  vy0 = _mm_mul_ps(vy0,va4);        //    vy0 * va4
  vy0 = _mm_fmadd_ps(vy1,vb4,vy0);  // += vy1 * vb4
  vy0 = _mm_fmadd_ps(vy2,vc4,vy0);  // += vy2 * vc4
  vy0 = _mm_fmadd_ps(vy3,vd4,vy0);  // += vy3 * vd4

  _mm_storeu_ps(dst,vy0);           // store 4 values along X
#else

// ==== scalar fallback: interpolation along Z then Y for each of the 4 X positions ====

  ninj2 = ninj + ninj;
  ninj3 = ninj2 + ninj;
  ni2 = ni + ni;
  ni3 = ni2 + ni;
  for (i=0 ; i<4 ; i++){
    z0 = src[i    ]*abcd[0] + src[i    +ninj]*abcd[1] +  src[i    +ninj2]*abcd[2] + src[i    +ninj3]*abcd[3];  // row 0
    z1 = src[i+ni ]*abcd[0] + src[i+ni +ninj]*abcd[1] +  src[i+ni +ninj2]*abcd[2] + src[i+ni +ninj3]*abcd[3];  // row 1
    z2 = src[i+ni2]*abcd[0] + src[i+ni2+ninj]*abcd[1] +  src[i+ni2+ninj2]*abcd[2] + src[i+ni2+ninj3]*abcd[3];  // row 2
    z3 = src[i+ni3]*abcd[0] + src[i+ni3+ninj]*abcd[1] +  src[i+ni3+ninj2]*abcd[2] + src[i+ni3+ninj3]*abcd[3];  // row 3
    dst[i] = z0*y0 + z1*y1 + z2*y2 + z3*y3;
  }
#endif

// ==== interpolation along x, scalar ====

  x0 = cm167*x*(x-one)*(x-two);
  x1 = cp5*(x+one)*(x-one)*(x-two);
  x2 = cm5*x*(x+one)*(x-two);
  x3 = cp167*x*(x+one)*(x-one);

  return(dst[0]*x0 + dst[1]*x1 + dst[2]*x2 + dst[3]*x3);
}
Exemplo n.º 4
0
// Single-precision kernel working on four columns of A (columns are lda = 8
// floats apart). For every processed row k it performs two updates:
//   y_n[k] += A[k,0]*x_n[0] + A[k,1]*x_n[1] + A[k,2]*x_n[2] + A[k,3]*x_n[3]
//             (x_n is negated first when alg==-1, so the sum is subtracted)
//   acc[j] += A[k,j]*x_t[k]   for j = 0..3
// and at the end y_t[0..3] is updated with acc (added when alg==1,
// subtracted otherwise).
//
//   kmax : number of rows to process; returns immediately when kmax<=0
//   kna  : rows with index below kna are processed one at a time before the
//          8-row loop starts (presumably to reach a panel boundary -- confirm
//          against the caller)
//   A    : matrix data; A is advanced by sda*lda floats per 8-row step
//   sda  : stride, in units of lda floats, between successive 8-row steps
//   x_n  : 4 input values, read once
//   y_n  : updated in place, one entry per processed row
//   x_t  : input values, one entry per processed row
//   y_t  : 4 values updated in place at the end
//   tri  : when 1, the first 4 rows are handled as a 4x4 corner (see below)
//   alg  : 1 adds to y_t; any other value subtracts; -1 additionally negates x_n
void kernel_ssymv_4_lib8_old(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	// distance, in floats, between consecutive columns of A
	const int lda = 8;
	
	int k;
	
	__m128
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	zeros = _mm_setzero_ps();

	// replicate each of the four x_n entries across all lanes
	x_n_0 = _mm_broadcast_ss( &x_n[0] );
	x_n_1 = _mm_broadcast_ss( &x_n[1] );
	x_n_2 = _mm_broadcast_ss( &x_n[2] );
	x_n_3 = _mm_broadcast_ss( &x_n[3] );

	// negate x_n so that the y_n update below subtracts instead of adds
	if(alg==-1)
		{
		x_n_0 = _mm_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm_sub_ps( zeros, x_n_3 );
		}

	// per-column accumulators for the y_t update, reduced after the loops
	y_t_0 = _mm_setzero_ps();
	y_t_1 = _mm_setzero_ps();
	y_t_2 = _mm_setzero_ps();
	y_t_3 = _mm_setzero_ps();
	
	k=0;

	// corner
	// 4x4 block handled one row at a time with single-lane operations.
	// In row i only the entries with column index j>=i are loaded: the
	// diagonal entry (j==i) contributes to the y_t accumulator only, the
	// entries with j>i contribute to both y_n[i] and the y_t accumulator j.
	// NOTE(review): this block always touches 4 rows and advances k by 4
	// without comparing against kmax or kna -- confirm callers guarantee it.
	if(tri==1)
		{
		
		y_n_0 = _mm_load_ss( &y_n[0] );
		x_t_0 = _mm_load_ss( &x_t[0] );
		
		a_00  = _mm_load_ss( &A[0+lda*0] );
		a_01  = _mm_load_ss( &A[0+lda*1] );
		a_02  = _mm_load_ss( &A[0+lda*2] );
		a_03  = _mm_load_ss( &A[0+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_00, x_t_0 );
		y_t_0 = _mm_add_ss( y_t_0, temp );
		temp  = _mm_mul_ss( a_01, x_n_1 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[0], y_n_0 );


		y_n_0 = _mm_load_ss( &y_n[1] );
		x_t_0 = _mm_load_ss( &x_t[1] );
		
/*		a_00  = _mm_load_ss( &A[1+lda*0] );*/
		a_01  = _mm_load_ss( &A[1+lda*1] );
		a_02  = _mm_load_ss( &A[1+lda*2] );
		a_03  = _mm_load_ss( &A[1+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_00, x_t_0 );*/
/*		y_t_0 = _mm_add_ss( y_t_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_n_1 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[1], y_n_0 );


		y_n_0 = _mm_load_ss( &y_n[2] );
		x_t_0 = _mm_load_ss( &x_t[2] );
		
/*		a_00  = _mm_load_ss( &A[2+lda*0] );*/
/*		a_01  = _mm_load_ss( &A[2+lda*1] );*/
		a_02  = _mm_load_ss( &A[2+lda*2] );
		a_03  = _mm_load_ss( &A[2+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_00, x_t_0 );*/
/*		y_t_0 = _mm_add_ss( y_t_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_n_1 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_t_0 );*/
/*		y_t_1 = _mm_add_ss( y_t_1, temp );*/
/*		temp  = _mm_mul_ss( a_02, x_n_2 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[2], y_n_0 );

		
		y_n_0 = _mm_load_ss( &y_n[3] );
		x_t_0 = _mm_load_ss( &x_t[3] );
		
/*		a_00  = _mm_load_ss( &A[3+lda*0] );*/
/*		a_01  = _mm_load_ss( &A[3+lda*1] );*/
/*		a_02  = _mm_load_ss( &A[3+lda*2] );*/
		a_03  = _mm_load_ss( &A[3+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_00, x_t_0 );*/
/*		y_t_0 = _mm_add_ss( y_t_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_n_1 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_t_0 );*/
/*		y_t_1 = _mm_add_ss( y_t_1, temp );*/
/*		temp  = _mm_mul_ss( a_02, x_n_2 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_02, x_t_0 );*/
/*		y_t_2 = _mm_add_ss( y_t_2, temp );*/
/*		temp  = _mm_mul_ss( a_03, x_n_3 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[3], y_n_0 );
		

		A   += 4;
		y_n += 4;
		x_t += 4;

		k += 4;

		}
	// rows below kna: one full row (all four columns) per iteration, single-lane
	for(; k<kna; k++)
		{
		
		y_n_0 = _mm_load_ss( &y_n[0] );
		x_t_0 = _mm_load_ss( &x_t[0] );
		
		a_00  = _mm_load_ss( &A[0+lda*0] );
		a_01  = _mm_load_ss( &A[0+lda*1] );
		a_02  = _mm_load_ss( &A[0+lda*2] );
		a_03  = _mm_load_ss( &A[0+lda*3] );
		
		temp  = _mm_mul_ss( a_00, x_n_0 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_00, x_t_0 );
		y_t_0 = _mm_add_ss( y_t_0, temp );
		temp  = _mm_mul_ss( a_01, x_n_1 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[0], y_n_0 );

	
		A   += 1;
		y_n += 1;
		x_t += 1;
		
		}
	// after any row-wise advance above, skip (sda-1)*lda further floats in A;
	// presumably this lands on the start of the next 8-row panel -- confirm
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	// main loop: 8 rows per iteration, done as two 4-lane halves (rows 0-3,
	// then rows 4-7); A is read with aligned loads, y_n and x_t with unaligned ones
	for(; k<kmax-7; k+=8)
		{
		
		y_n_0 = _mm_loadu_ps( &y_n[0] );
		x_t_0 = _mm_loadu_ps( &x_t[0] );
		
		a_00  = _mm_load_ps( &A[0+lda*0] );
		a_01  = _mm_load_ps( &A[0+lda*1] );
		a_02  = _mm_load_ps( &A[0+lda*2] );
		a_03  = _mm_load_ps( &A[0+lda*3] );
		
		temp  = _mm_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm_add_ps( y_t_0, temp );
		temp  = _mm_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm_add_ps( y_t_1, temp );
		temp  = _mm_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm_add_ps( y_t_2, temp );
		temp  = _mm_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm_add_ps( y_t_3, temp );
		
		_mm_storeu_ps( &y_n[0], y_n_0 );
		

		y_n_0 = _mm_loadu_ps( &y_n[4] );
		x_t_0 = _mm_loadu_ps( &x_t[4] );
		
		a_00  = _mm_load_ps( &A[4+lda*0] );
		a_01  = _mm_load_ps( &A[4+lda*1] );
		a_02  = _mm_load_ps( &A[4+lda*2] );
		a_03  = _mm_load_ps( &A[4+lda*3] );
		
		temp  = _mm_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm_add_ps( y_t_0, temp );
		temp  = _mm_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm_add_ps( y_t_1, temp );
		temp  = _mm_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm_add_ps( y_t_2, temp );
		temp  = _mm_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm_add_ps( y_t_3, temp );
		
		_mm_storeu_ps( &y_n[4], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	
	// tail: remaining rows (fewer than 8), one row per iteration, single-lane
	for(; k<kmax; k++)
		{
		
		y_n_0 = _mm_load_ss( &y_n[0] );
		x_t_0 = _mm_load_ss( &x_t[0] );
		
		a_00  = _mm_load_ss( &A[0+lda*0] );
		a_01  = _mm_load_ss( &A[0+lda*1] );
		a_02  = _mm_load_ss( &A[0+lda*2] );
		a_03  = _mm_load_ss( &A[0+lda*3] );
		
		temp  = _mm_mul_ss( a_00, x_n_0 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_00, x_t_0 );
		y_t_0 = _mm_add_ss( y_t_0, temp );
		temp  = _mm_mul_ss( a_01, x_n_1 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[0], y_n_0 );

	
		A   += 1;
		y_n += 1;
		x_t += 1;
		
		}

	// reduction
	// two levels of horizontal adds: afterwards lane j of y_t_0 holds the sum
	// of the four lanes of accumulator j (j = 0..3)
	y_t_0 = _mm_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm_hadd_ps(y_t_0, y_t_2);

	// add the reduced sums to y_t[0..3]
	if(alg==1)
		{
		y_t_1 = _mm_loadu_ps( &y_t[0] );

		y_t_1 = _mm_add_ps(y_t_1, y_t_0);

		_mm_storeu_ps(&y_t[0], y_t_1);
		}
	// subtract the reduced sums from y_t[0..3]; taken for every alg other than 1
	else // alg==-1
		{
		y_t_1 = _mm_loadu_ps( &y_t[0] );

		y_t_1 = _mm_sub_ps(y_t_1, y_t_0);

		_mm_storeu_ps(&y_t[0], y_t_1);
		}
	
	}