コード例 #1
0
static void NOINLINE transposeX8( const __m256 *v1, __m256 *vout )
{
#if 0 // AVX1
    __m256 a0 = _mm256_unpacklo_ps( v1[ 0 ], v1[ 1 ] );
    __m256 a1 = _mm256_unpackhi_ps( v1[ 0 ], v1[ 1 ] );
    __m256 b0 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    __m256 b1 = _mm256_permute2f128_ps( a0, a1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
    __m256 c0 = _mm256_unpacklo_ps( b0, b1 );
    __m256 c1 = _mm256_unpackhi_ps( b0, b1 );
    vout[ 0 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 2, 0, 0 ) );
    vout[ 1 ] = _mm256_permute2f128_ps( c0, c1, _MM_SHUFFLE( 0, 3, 0, 1 ) );
#else // AVX2
    static const int ALIGN32 p1[ 8 ] = { 0, 4, 2, 6, 1, 5, 3, 7 };
    static const int ALIGN32 p2[ 8 ] = { 2, 6, 0, 4, 3, 7, 1, 5 };
    const __m256i perm1 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p1 ) );
    const __m256i perm2 = _mm256_load_si256( reinterpret_cast< const __m256i* >( p2 ) );
    __m256 a0 = _mm256_permutevar8x32_ps( v1[ 0 ], perm1 );
    __m256 a1 = _mm256_permutevar8x32_ps( v1[ 1 ], perm2 );
    vout[ 0 ] = _mm256_blend_ps( a0, a1, 0xCC );
    vout[ 1 ] = _mm256_shuffle_ps( a0, a1, 0x4E );
#endif
}
コード例 #2
0
ファイル: testimm-3.c プロジェクト: 0day-ci/gcc
void
test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);	  /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);	  /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);		  /* { dg-error "the last argument must be an 8-bit immediate" } */
}
コード例 #3
0
ファイル: kernel_strmv_avx_lib8.c プロジェクト: mvukov/hpmpc
void kernel_strmv_u_n_8_lib8(int kmax, float *A, float *x, float *y, int alg)
	{

	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int k;

	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0, x_1, x_2, x_3,
		y_0, y_0_b, y_0_c, y_0_d, z_0;
	
	zeros = _mm256_setzero_ps();	

	y_0   = _mm256_setzero_ps();	
	y_0_b = _mm256_setzero_ps();	
	y_0_c = _mm256_setzero_ps();	
	y_0_d = _mm256_setzero_ps();	
	

	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0  = _mm256_broadcast_ss( &x[0] );
	x_0  = _mm256_blend_ps( zeros, x_0, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );

	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1  = _mm256_broadcast_ss( &x[1] );
	x_1  = _mm256_blend_ps( zeros, x_1, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );

	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2  = _mm256_broadcast_ss( &x[2] );
	x_2  = _mm256_blend_ps( zeros, x_2, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );

	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3  = _mm256_broadcast_ss( &x[3] );
	x_3  = _mm256_blend_ps( zeros, x_3, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );

	A += 4*lda;
	x += 4;

	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	x_0  = _mm256_broadcast_ss( &x[0] );
	x_0  = _mm256_blend_ps( zeros, x_0, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );

	a_01 = _mm256_load_ps( &A[0+lda*1] );
	x_1  = _mm256_broadcast_ss( &x[1] );
	x_1  = _mm256_blend_ps( zeros, x_1, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_1 );
	y_0_b = _mm256_add_ps( y_0_b, ax_temp );

	a_02 = _mm256_load_ps( &A[0+lda*2] );
	x_2  = _mm256_broadcast_ss( &x[2] );
	x_2  = _mm256_blend_ps( zeros, x_2, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_2 );
	y_0_c = _mm256_add_ps( y_0_c, ax_temp );

	a_03 = _mm256_load_ps( &A[0+lda*3] );
	x_3  = _mm256_broadcast_ss( &x[3] );
	ax_temp = _mm256_mul_ps( a_03, x_3 );
	y_0_d = _mm256_add_ps( y_0_d, ax_temp );

	A += 4*lda;
	x += 4;

	k=8;
	for(; k<kmax-7; k+=8)
		{

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2  = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3  = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2  = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3  = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		}
	for(; k<kmax-3; k+=4)
		{

		__builtin_prefetch( A + 4*lda );
		__builtin_prefetch( A + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		a_02 = _mm256_load_ps( &A[0+lda*2] );
		x_2  = _mm256_broadcast_ss( &x[2] );
		ax_temp = _mm256_mul_ps( a_02, x_2 );
		y_0_c = _mm256_add_ps( y_0_c, ax_temp );

		a_03 = _mm256_load_ps( &A[0+lda*3] );
		x_3  = _mm256_broadcast_ss( &x[3] );
		ax_temp = _mm256_mul_ps( a_03, x_3 );
		y_0_d = _mm256_add_ps( y_0_d, ax_temp );

		A += 4*lda;
		x += 4;

		}

	y_0   = _mm256_add_ps( y_0  , y_0_c );
	y_0_b = _mm256_add_ps( y_0_b, y_0_d );

	if(kmax%4>=2)
		{

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );

		a_01 = _mm256_load_ps( &A[0+lda*1] );
		x_1  = _mm256_broadcast_ss( &x[1] );
		ax_temp = _mm256_mul_ps( a_01, x_1 );
		y_0_b = _mm256_add_ps( y_0_b, ax_temp );

		A += 2*lda;
		x += 2;

		}
	
	y_0   = _mm256_add_ps( y_0  , y_0_b );

	if(kmax%2==1)
		{

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		x_0  = _mm256_broadcast_ss( &x[0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		
/*		A += 1*lda;*/
/*		x += 1;*/

		}

	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_add_ps( z_0, y_0 );

		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_sub_ps( z_0, y_0 );

		_mm256_storeu_ps(&y[0], z_0);
		}

	}
コード例 #4
0
ファイル: kernel_strmv_avx_lib8.c プロジェクト: mvukov/hpmpc
void kernel_strmv_u_t_8_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 8;
/*	const int bs  = 8;*/
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );
	__builtin_prefetch( A + 4*lda );
	__builtin_prefetch( A + 6*lda );

	int
		k;
	
	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();

	k=0;
	for(; k<kmax-7; k+=8)
		{
		
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );
	
		__builtin_prefetch( A + sda*lda + 4*lda );
		__builtin_prefetch( A + sda*lda + 6*lda );

		a_00 = _mm256_load_ps( &A[0+lda*4] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_4 = _mm256_add_ps( y_4, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*5] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_5 = _mm256_add_ps( y_5, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*6] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_6 = _mm256_add_ps( y_6, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*7] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_7 = _mm256_add_ps( y_7, ax_temp );

		A += sda*lda;
		x += lda;

		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	a_00 = _mm256_load_ps( &A[0+lda*4] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x1f );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_4 = _mm256_add_ps( y_4, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*5] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x3f );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_5 = _mm256_add_ps( y_5, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*6] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x7f );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_6 = _mm256_add_ps( y_6, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*7] );
/*	a_03 = _mm256_blend_ps( zeros, a_03, 0xff );*/
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_7 = _mm256_add_ps( y_7, ax_temp );

	// reduction
	__m256
		z_0;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);
	y_4 = _mm256_hadd_ps(y_4, y_5);
	y_6 = _mm256_hadd_ps(y_6, y_7);

	y_0 = _mm256_hadd_ps(y_0, y_2);
	y_4 = _mm256_hadd_ps(y_4, y_6);

	y_1 = _mm256_permute2f128_ps(y_0, y_4, 0x20);
	y_2 = _mm256_permute2f128_ps(y_0, y_4, 0x31);
	
	y_0 = _mm256_add_ps(y_1, y_2);

	// store
	if(alg==0)
		{
		_mm256_storeu_ps(&y[0], y_0);
		}
	else if(alg==1)
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_add_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm256_loadu_ps( &y[0] );

		z_0 = _mm256_sub_ps(z_0, y_0);

		_mm256_storeu_ps(&y[0], z_0);
		}

	}
コード例 #5
0
ファイル: main.cpp プロジェクト: minh0722/HPC2015
M_ALWAYS_INLINE
static void bar(float (& input)[8]) 
{
    
    /*
	static constexpr uint_fast8_t idx[][2] = {
		{0, 1}, {3, 2}, {4, 5}, {7, 6}, // (1)
		{0, 2}, {1, 3}, {6, 4}, {7, 5}, // (2)
		{0, 1}, {2, 3}, {5, 4}, {7, 6}, // (3)
		{0, 4}, {1, 5}, {2, 6}, {3, 7}, // (4)
		{0, 2}, {1, 3}, {4, 6}, {5, 7}, // (5)
		{0, 1}, {2, 3}, {4, 5}, {6, 7} // (6)
	};
    */
    // Индекса трябва да представим в по удобен вид за
    // AVX инструкциите. Няма смисъл от цикъл и после развиване
    // защото (4)-тия случай е специален... По добре на ръка.
    
    static constexpr int blend_mask_1 =0b10011001;
    static constexpr int blend_mask_2=0b11000011;
    static constexpr int blend_mask_3 =0b10100101;
    static constexpr int blend_mask_4 =0b00001111;
    static constexpr int blend_mask_5=0b00110011;
    static constexpr int blend_mask_6=0b01010101;
    
    // Отговаря на (1), (3) и (6)
    static constexpr int permute_mask_1=0b10110001;
    
    
    // Отговаря на (2) и (5)
    static constexpr int permute_mask_2=0b01001110;
    
    
    __m256 result= _mm256_load_ps(input);
    
    // (1)  
    
    __m256 mapped=_mm256_permute_ps(result,permute_mask_1);
    
    __m256 min=_mm256_min_ps(result,mapped);
    __m256 max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_1);
    
    // (2)
    
    mapped=_mm256_permute_ps(result,permute_mask_2);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_2);
    
    // (3)
    
    mapped=_mm256_permute_ps(result,permute_mask_1);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_3);
    
    // (4) Специалния случай тук трябва да пермутираме
    // между двете половини на YMM регистъра.
    
    mapped=_mm256_permute2f128_ps(result,result,1);
   
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_4);
   
    // (5)
    
    mapped=_mm256_permute_ps(result,permute_mask_2);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_5);
    
    // (6)
    
    mapped=_mm256_permute_ps(result,permute_mask_1);
    
    min=_mm256_min_ps(result,mapped);
    max=_mm256_max_ps(result,mapped);
    
    result=_mm256_blend_ps(max,min,blend_mask_6);
     /**/
    _mm256_store_ps(input,result);
}
コード例 #6
0
ファイル: median7.cpp プロジェクト: chenbk85/tps5
inline  __m256 _mm256_broadcast_3_ss(__m256 a) {
	__m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(3, 3, 3, 3));
	return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0);
}
コード例 #7
0
ファイル: median7.cpp プロジェクト: chenbk85/tps5
inline  __m256 _mm256_broadcast_lo_ss(__m256 a) {
	__m256 b = _mm256_permute_ps(a, _MM_SHUFFLE(0, 0, 0, 0)); \
	return _mm256_blend_ps(b, _mm256_permute2f128_ps(b, b, 1), 0xF0); \
}
コード例 #8
0
ファイル: kernel_ssymv_avx_lib8.c プロジェクト: giaf/hpmpc
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int 
		k, k_left, ii;
	
	float 
		k_left_d;

	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
	float temp_space[8] = {};

	__m256
		mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	mask = _mm256_loadu_ps( mask_f ); 

	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();
	
	k=0;

	// corner
	if(tri==1)
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

/*_mm256_storeu_ps( temp_space, x_t_0 );		*/
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
/*exit(1);*/

		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x00 );
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
/*_mm256_storeu_ps( temp_space, y_n_0 );		*/
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, y_n_0 );		*/
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;

		}

	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*	for(; k<kna; k++)*/
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 );		*/
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		
		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/

/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*printf("\nk_left = %d\n", k_left);*/
/*exit(1);*/
	
		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;
		
		}
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	for(; k<kmax-7; k+=8)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		temp  = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_load_ps( &A[0+lda*1] );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_load_ps( &A[0+lda*2] );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_load_ps( &A[0+lda*3] );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	
	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		
		k_left = kmax-k;

		k_left_d = 8.0 - k_left;

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
/*printf("\nk_left2 = %d\n", k_left, kmax, k);*/
		a_00  = _mm256_load_ps( &A[0+lda*0] );
/*printf("\nk_left2 = %d\n", k_left);*/
		a_01  = _mm256_load_ps( &A[0+lda*1] );
		a_02  = _mm256_load_ps( &A[0+lda*2] );
		a_03  = _mm256_load_ps( &A[0+lda*3] );
		
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
	
/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*		A   += 1;*/
/*		y_n += 1;*/
/*		x_t += 1;*/
		
		}

	// reduction
	__m128
		z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);
	
	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	
	}