Example #1
0
// Rounding half away from zero (equivalent to round() from math.h)
// __m256 contains 8 floats, but to simplify the examples, only 4 will be shown
// Initial values to be used in the examples:
// [-12.49  -0.5   1.5   3.7]
static __m256 c63_mm256_roundhalfawayfromzero_ps(const __m256 initial)
{
	const __m256 sign_mask = _mm256_set1_ps(-0.f);
	const __m256 one_half = _mm256_set1_ps(0.5f);
	const __m256 all_zeros = _mm256_setzero_ps();
	const __m256 pos_one = _mm256_set1_ps(1.f);
	const __m256 neg_one = _mm256_set1_ps(-1.f);

	// Creates a mask based on the sign of the floats, true for negative floats
	// Example: [true   true   false   false]
	__m256 less_than_zero = _mm256_cmp_ps(initial, all_zeros, _CMP_LT_OQ);

	// Returns the integer part of the floats
	// Example: [-12.0   -0.0   1.0   3.0]
	__m256 without_fraction = _mm256_round_ps(initial, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));

	// Returns the fraction part of the floats
	// Example: [-0.49   -0.5   0.5   0.7]
	__m256 fraction = _mm256_sub_ps(initial, without_fraction);

	// Absolute values of the fractions
	// Example: [0.49   0.5   0.5   0.7]
	__m256 fraction_abs = _mm256_andnot_ps(sign_mask, fraction);

	// Compares abs(fractions) to 0.5, true if lower
	// Example: [true   false   false   false]
	__m256 less_than_one_half = _mm256_cmp_ps(fraction_abs, one_half, _CMP_LT_OQ);

	// Blends 1.0 and -1.0 depending on the initial sign of the floats
	// Example: [-1.0   -1.0   1.0   1.0]
	__m256 signed_ones = _mm256_blendv_ps(pos_one, neg_one, less_than_zero);

	// Blends the previous result with zeros depending on the fractions that are lower than 0.5
	// Example: [0.0   -1.0   1.0   1.0]
	__m256 to_add = _mm256_blendv_ps(signed_ones, all_zeros, less_than_one_half);

	// Adds the previous result to the floats without fractions
	// Example: [-12.0   -1.0   2.0   4.0]
	return _mm256_add_ps(without_fraction, to_add);
}
Example #2
0
 INLINE const avxi select( const avxb& mask, const avxi& t, const avxi& f ) { return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(f), _mm256_castsi256_ps(t), mask)); }
Example #3
0
void kernel_ssymv_4_lib8(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int 
		k, k_left, ii;
	
	float 
		k_left_d;

	const float mask_f[] = {7.5, 6.5, 5.5, 4.5, 3.5, 2.5, 1.5, 0.5};
	float temp_space[8] = {};

	__m256
		mask,
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	mask = _mm256_loadu_ps( mask_f ); 

	zeros = _mm256_setzero_ps();

	x_n_0 = _mm256_broadcast_ss( &x_n[0] );
	x_n_1 = _mm256_broadcast_ss( &x_n[1] );
	x_n_2 = _mm256_broadcast_ss( &x_n[2] );
	x_n_3 = _mm256_broadcast_ss( &x_n[3] );

	if(alg==-1) // TODO xor
		{
		x_n_0 = _mm256_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm256_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm256_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm256_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm256_setzero_ps();
	y_t_1 = _mm256_setzero_ps();
	y_t_2 = _mm256_setzero_ps();
	y_t_3 = _mm256_setzero_ps();
	
	k=0;

	// corner
	if(tri==1)
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );

/*_mm256_storeu_ps( temp_space, x_t_0 );		*/
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
/*exit(1);*/

		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x00 );
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		a_00  = _mm256_blend_ps( a_00, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x01 );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_01  = _mm256_blend_ps( a_01, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x03 );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_02  = _mm256_blend_ps( a_02, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x07 );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		a_03  = _mm256_blend_ps( a_03, zeros, 0x0f );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
/*_mm256_storeu_ps( temp_space, y_n_0 );		*/
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, y_n_0 );		*/
/*printf("\ny = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;

		}

	if(k<kna) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
/*	for(; k<kna; k++)*/
		{
		
		k_left = kna-k;

		k_left_d = 8.0 - k_left;
/*printf("\nk_left = %d\n", k_left);*/

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
/*_mm256_storeu_ps( temp_space, x_t_0 );		*/
/*printf("\nx = %f %f %f %f %f %f %f %f \n", temp_space[0], temp_space[1], temp_space[2], temp_space[3], temp_space[4], temp_space[5], temp_space[6], temp_space[7] );*/
		
		a_00  = _mm256_loadu_ps( &A[0+lda*0] );
		a_01  = _mm256_loadu_ps( &A[0+lda*1] );
		a_02  = _mm256_loadu_ps( &A[0+lda*2] );
		a_03  = _mm256_loadu_ps( &A[0+lda*3] );
		
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/

/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*printf("\nk_left = %d\n", k_left);*/
/*exit(1);*/
	
		A   += k_left;
		y_n += k_left;
		x_t += k_left;

		k   += k_left;
		
		}
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
	for(; k<kmax-7; k+=8)
		{
		
		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		y_n_0 = _mm256_loadu_ps( &y_n[0] );
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		
		a_00  = _mm256_load_ps( &A[0+lda*0] );
		temp  = _mm256_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );

		a_01  = _mm256_load_ps( &A[0+lda*1] );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );

		a_02  = _mm256_load_ps( &A[0+lda*2] );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );

		a_03  = _mm256_load_ps( &A[0+lda*3] );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		_mm256_storeu_ps( &y_n[0], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
	
	if(k<kmax) // it can be only k_left = {1, 2, 3, 4, 5, 6, 7}
		{
		
		k_left = kmax-k;

		k_left_d = 8.0 - k_left;

/*		y_n_0 = _mm_load_ps( &y_n[0] );*/
/*		y_n_0 = _mm_setzero_ps();*/
		x_t_0 = _mm256_loadu_ps( &x_t[0] );
		x_t_0 = _mm256_blendv_ps( x_t_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		
/*printf("\nk_left2 = %d\n", k_left, kmax, k);*/
		a_00  = _mm256_load_ps( &A[0+lda*0] );
/*printf("\nk_left2 = %d\n", k_left);*/
		a_01  = _mm256_load_ps( &A[0+lda*1] );
		a_02  = _mm256_load_ps( &A[0+lda*2] );
		a_03  = _mm256_load_ps( &A[0+lda*3] );
		
/*		temp  = _mm256_mul_ps( a_00, x_n_0 );*/
/*		y_n_0 = _mm256_add_ps( y_n_0, temp );*/
		y_n_0 = _mm256_mul_ps( a_00, x_n_0 );
		temp  = _mm256_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm256_add_ps( y_t_0, temp );
		temp  = _mm256_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm256_add_ps( y_t_1, temp );
		temp  = _mm256_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm256_add_ps( y_t_2, temp );
		temp  = _mm256_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm256_add_ps( y_n_0, temp );
		temp  = _mm256_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm256_add_ps( y_t_3, temp );
		
		y_n_0 = _mm256_blendv_ps( y_n_0, zeros, _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ) );
		x_t_0 = _mm256_loadu_ps( &y_n[0] );
		y_n_0 = _mm256_add_ps( y_n_0, x_t_0 );
		_mm256_storeu_ps( &y_n[0], y_n_0 );
/*		_mm256_maskstore_ps( &y_n[0], (__m256i) _mm256_sub_ps( mask, _mm256_broadcast_ss( &k_left_d) ), y_n_0 );*/
	
/*		_mm256_storeu_ps( temp_space, y_n_0 );*/
/*		for(ii=0; ii<k_left; ii++)*/
/*			y_n[ii] = temp_space[ii];*/

/*		A   += 1;*/
/*		y_n += 1;*/
/*		x_t += 1;*/
		
		}

	// reduction
	__m128
		z_0, z_1;

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm256_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm256_hadd_ps(y_t_0, y_t_2);

	y_t_1 = _mm256_permute2f128_ps(y_t_0, y_t_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_t_0);
	z_1 = _mm256_castps256_ps128(y_t_1);
	
	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y_t[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps( &y_t[0], z_0 );
		}
	
	}