Example #1
File: Color.cpp Project: Gallaecio/0ad
static SColor4ub sse_ConvertRGBColorTo4ub(const RGBColor& src)
{
	const __m128 zero = _mm_setzero_ps();
	const __m128 _255 = _mm_set_ss(255.0f);
	__m128 r = _mm_load_ss(&src.X);
	__m128 g = _mm_load_ss(&src.Y);
	__m128 b = _mm_load_ss(&src.Z);

	// C = min(255, 255*max(C, 0)) ( == clamp(255*C, 0, 255) )
	r = _mm_max_ss(r, zero);
	g = _mm_max_ss(g, zero);
	b = _mm_max_ss(b, zero);

	r = _mm_mul_ss(r, _255);
	g = _mm_mul_ss(g, _255);
	b = _mm_mul_ss(b, _255);

	r = _mm_min_ss(r, _255);
	g = _mm_min_ss(g, _255);
	b = _mm_min_ss(b, _255);

	// convert each channel to integer and combine them into the result
	int ri = _mm_cvtss_si32(r);
	int gi = _mm_cvtss_si32(g);
	int bi = _mm_cvtss_si32(b);

	return SColor4ub(ri, gi, bi, 0xFF);
}
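For reference, a hedged scalar sketch of what the SSE path above computes (this is not 0ad code; it only assumes the RGBColor and SColor4ub types from the same file, plus <algorithm> for std::min/std::max; note that _mm_cvtss_si32 rounds with the current MXCSR mode, round-to-nearest-even by default, while this sketch rounds by adding 0.5f):
#include <algorithm>

static SColor4ub scalar_ConvertRGBColorTo4ub(const RGBColor& src)
{
	// clamp(255*C, 0, 255) per channel, mirroring the comment in the SSE version
	float r = std::min(255.0f, 255.0f * std::max(src.X, 0.0f));
	float g = std::min(255.0f, 255.0f * std::max(src.Y, 0.0f));
	float b = std::min(255.0f, 255.0f * std::max(src.Z, 0.0f));
	return SColor4ub((int)(r + 0.5f), (int)(g + 0.5f), (int)(b + 0.5f), 0xFF);
}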
void SubpixelMaximizer::fitUsingSSE3(float coef[FitMatrix::ROWS], const signed short data[3][3][3]) const
{
  assert(FitMatrix::PADDEDCOLS == 32);
  __m128 localFitMatrixScale = _mm_set_ss(fitMatrix.scale);
  const short* localFitMatrix = fitMatrix();
  // Load data into four SSE Registers
  __m128i x[4];
  signed short* dataFlat = (signed short*) data; // flat array of 27 signed shorts
  x[0] = _mm_loadu_si128((__m128i*)(dataFlat + 0));
  x[1] = _mm_loadu_si128((__m128i*)(dataFlat + 8));
  x[2] = _mm_loadu_si128((__m128i*)(dataFlat + 16));
  x[3] = _mm_loadu_si128((__m128i*)(dataFlat + 24));
  x[3] = _mm_srli_si128(_mm_slli_si128(x[3], 10), 10);   // Clear dataFlat[27..31]

  for(int i = 0; i < FitMatrix::ROWS; i++)
  {
    // Compute the scalar product between ((short*)x)[0..31] and localFitMatrix
    __m128i sum =             _mm_madd_epi16(x[0], *(__m128i*)(localFitMatrix + 0));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[1], *(__m128i*)(localFitMatrix + 8)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[2], *(__m128i*)(localFitMatrix + 16)));
    sum = _mm_add_epi32(sum, _mm_madd_epi16(x[3], *(__m128i*)(localFitMatrix + 24)));
    sum = _mm_hadd_epi32(sum, sum);
    sum = _mm_hadd_epi32(sum, sum);
    _mm_store_ss(coef + i, _mm_mul_ss(_mm_cvtepi32_ps(sum), localFitMatrixScale));
    localFitMatrix += 32;
  }
}
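A hedged scalar sketch of the reduction fitUsingSSE3 performs (not from the original project; it assumes fitMatrix() yields ROWS rows of 32 padded signed shorts, of which only the first 27 per row are meaningful, matching the SSE path that zeroes dataFlat[27..31]):
void fitScalarRef(float coef[], int rows, float scale,
                  const signed short* fitMatrixFlat, const signed short data[3][3][3])
{
  const signed short* dataFlat = &data[0][0][0]; // flat array of 27 signed shorts
  for(int i = 0; i < rows; i++)
  {
    int sum = 0;                                 // _mm_madd_epi16 accumulates in 32-bit ints
    for(int j = 0; j < 27; j++)
      sum += int(dataFlat[j]) * int(fitMatrixFlat[i * 32 + j]);
    coef[i] = float(sum) * scale;                // final scale by fitMatrix.scale
  }
}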
Example #3
File: ibMtx4.cpp Project: Innabus/Innabus
ibMtx4& ibMtx4::Invert()
{
	f32* src = &data.a[0][0];
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;
#if !defined NDEBUG || defined STATIC
	// Suppress RTC error for uninit vars
	f32 init = 0.f;
	row3 = row1 = tmp1 = _mm_load_ps1( &init );
#endif // NDEBUG
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// -----------------------------------------------
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);

	return *this;
}
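The _mm_rcp_ss sequence near the end of Invert() is the standard Newton-Raphson refinement of the hardware reciprocal estimate of the determinant. A hedged, self-contained sketch of that single step (not project code):
#include <xmmintrin.h>

// x0 = _mm_rcp_ss(d) is only a ~12-bit estimate of 1/d; one Newton-Raphson step
// x1 = 2*x0 - d*x0*x0 roughly doubles the precision, which is what the
// det/tmp1 lines above compute before broadcasting 1/det to all four lanes.
static inline float rcp_refined(float d)
{
	__m128 vd = _mm_set_ss(d);
	__m128 x0 = _mm_rcp_ss(vd);                                   // coarse estimate of 1/d
	__m128 x1 = _mm_sub_ss(_mm_add_ss(x0, x0),                    // 2*x0
	                       _mm_mul_ss(vd, _mm_mul_ss(x0, x0)));   // - d*x0*x0
	return _mm_cvtss_f32(x1);
}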
__m128 test (__m128 s1, __m128 s2)
{
  return _mm_mul_ss (s1, s2); 
}
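A hedged usage sketch (not part of the test) showing the semantics this wrapper exposes: _mm_mul_ss multiplies only the lowest lanes and passes the three upper lanes of the first operand through unchanged.
#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
  __m128 s1 = _mm_setr_ps(2.0f, 10.0f, 20.0f, 30.0f);
  __m128 s2 = _mm_setr_ps(4.0f, 99.0f, 99.0f, 99.0f);
  float out[4];
  _mm_storeu_ps(out, _mm_mul_ss(s1, s2));
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]); // prints: 8 10 20 30
  return 0;
}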
Example #5
// Computes the inverse according to Cramer's Rule
// See ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const
{
	f32 *src = (f32*)&mat;

	__m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps();
	__m128 row0=_mm_setzero_ps(),   row1=_mm_setzero_ps(),   row2=_mm_setzero_ps(),   row3=_mm_setzero_ps();
	__m128 det=_mm_setzero_ps(),    tmp1=_mm_setzero_ps();

	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	
	_mm_store_ss(&detv, det);

	Mat44 t;
	if(out)
	{
		src = (f32*)out->mat;
	}
	else
	{
		src = t.mat;
	}

	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);
}
Example #6
void kernel_sgemv_t_1_lib4(int kmax, int kna, float *A, int sda, float *x, float *y, int alg)
	{
	if(kmax<=0) return;
	
	const int lda = 4;
	
	int k;
	int ka = kmax-kna; // number of elements from the aligned position
	
	__m128
		a_00_10_20_30,
		x_0_1_2_3,
		y_0, y_1;
	
	y_0 = _mm_setzero_ps();	

	k = 0;
	if(kna>0)
		{
		for(; k<kna; k++)
			{
		
			x_0_1_2_3 = _mm_load_ss( &x[0] );

			a_00_10_20_30 = _mm_load_ss( &A[0+lda*0] );
		
/*			y_0 += a_00_10_20_30 * x_0_1_2_3;*/
			a_00_10_20_30 = _mm_mul_ss( a_00_10_20_30, x_0_1_2_3 );
			y_0 = _mm_add_ss( y_0, a_00_10_20_30 );
		
			x += 1;
			A += 1;

			}

		A += (sda-1)*lda;
		}

	k = 0;
	for(; k<ka-3; k+=4)
		{
		
		x_0_1_2_3 = _mm_loadu_ps( &x[0] );

		a_00_10_20_30 = _mm_load_ps( &A[0+lda*0] );
		
/*		y_0 += a_00_10_20_30 * x_0_1_2_3;*/
		a_00_10_20_30 = _mm_mul_ps( a_00_10_20_30, x_0_1_2_3 );
		y_0 = _mm_add_ps( y_0, a_00_10_20_30 );
		
		x += 4;
		A += 4;

		A += (sda-1)*lda;

		}
	for(; k<ka; k++)
		{
		
		x_0_1_2_3 = _mm_load_ss( &x[0] );

		a_00_10_20_30 = _mm_load_ss( &A[0+lda*0] );
	
/*		y_0 += a_00_10_20_30 * x_0_1_2_3;*/
		a_00_10_20_30 = _mm_mul_ss( a_00_10_20_30, x_0_1_2_3 );
		y_0 = _mm_add_ss( y_0, a_00_10_20_30 );
	
		x += 1;
		A += 1;
		
		}

	__m128
		y_0_1_2_3;

	y_1 = _mm_setzero_ps();
	y_0 = _mm_hadd_ps(y_0, y_1);
	y_0 = _mm_hadd_ps(y_0, y_1);

	if(alg==0)
		{
		_mm_store_ss(&y[0], y_0);
		}
	else if(alg==1)
		{
		y_0_1_2_3 = _mm_load_ss( &y[0] );

		y_0_1_2_3 = _mm_add_ss(y_0_1_2_3, y_0);
	
		_mm_store_ss(&y[0], y_0_1_2_3);
		}
	else // alg==-1
		{
		y_0_1_2_3 = _mm_load_ss( &y[0] );

		y_0_1_2_3 = _mm_sub_ss(y_0_1_2_3, y_0);
	
		_mm_store_ss(&y[0], y_0_1_2_3);
		}

	}
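A hedged scalar sketch (not part of the library) of what kernel_sgemv_t_1_lib4 computes, up to floating-point rounding: the dot product of one column of a panel-major ("lib4") matrix, stored as 4-row panels spaced sda*4 floats apart, with x, then stored into, added to, or subtracted from y[0] depending on alg.
static void sgemv_t_1_ref(int kmax, int kna, const float *A, int sda, const float *x, float *y, int alg)
	{
	const int lda = 4;
	float sum = 0.0f;
	int k;
	for(k=0; k<kna; k++) // unaligned prefix inside the first panel
		{
		sum += A[0] * x[0];
		A += 1; x += 1;
		}
	if(kna>0)
		A += (sda-1)*lda; // jump to the next 4-row panel
	for(; k<kmax; k++)
		{
		sum += A[0] * x[0];
		A += 1; x += 1;
		if((k-kna)%4==3) // finished a full panel: jump to the next one
			A += (sda-1)*lda;
		}
	if(alg==0)
		y[0] = sum;
	else if(alg==1)
		y[0] += sum;
	else // alg==-1
		y[0] -= sum;
	}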
Example #7
// Inverts a 4x4 matrix and returns the determinant
inline float invert_44_matrix(float* src)
{
	// Code pulled from "Streaming SIMD Extensions - Inverse of 4x4 Matrix"
	// by Intel.
	// ftp://download.intel.com/design/PentiumIII/sml/24504301.pdf
	__m128 minor0;
	__m128 minor1;
	__m128 minor2;
	__m128 minor3;
	__m128 row0;
	__m128 row1;
	__m128 row2;
	__m128 row3;
	__m128 det;
	__m128 tmp1;
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// -----------------------------------------------
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);

	return det[0];
}
Example #8
// from intel 
Matrix4x4SSE &Matrix4x4SSE::Invert(void)
{
	float *src = &m_Vec0[0];

	__m128 minor0, minor1, minor2, minor3;
	__m128 det;

	// Initialized from members only to silence uninitialized-variable warnings; the loads below overwrite every lane.
	__m128 tmp1 = m_Vec0.m_Vec;
	__m128 row0 = m_Vec0.m_Vec;
	__m128 row1 = m_Vec1.m_Vec;
	__m128 row2 = m_Vec2.m_Vec;
	__m128 row3 = m_Vec3.m_Vec;

	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);

	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	// -----------------------------------------------	
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row1);	
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	// -----------------------------------------------
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	// -----------------------------------------------
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);

	return *this;
}
Example #9
// inverted diagonal !!!
void kernel_spotrf_strsv_1x1_lib4(int kmax, float *A, int sda, int *info)
	{
	
	const int lda = 4;
	
	__m128
		zeros, ones,
		a_00,
		b_00_10;
	
	zeros = _mm_set_ss( 0.0 );

	a_00 = _mm_load_ss( &A[0+lda*0] );
	if( _mm_comile_ss ( a_00, zeros ) ) { *info = 1; return; }
	a_00 = _mm_sqrt_ss( a_00 );
	ones = _mm_set_ss( 1.0 );
	a_00 = _mm_div_ss( ones, a_00 );
	_mm_store_ss( &A[0+lda*0], a_00 );
	
	if(kmax<=0)
		return;
	
	// strsv


	a_00 = _mm_shuffle_ps( a_00, a_00, 0 );
	
	int k, kna;
	
	float
		*AA;
	
	AA = A + 1;
	k = 0;

	// clean up unaligned stuff at the beginning
	kna = 3;
	if(kmax<kna)
		kna = kmax;

	for(; k<kna; k++)
		{
		b_00_10 = _mm_load_ss( &AA[lda*0] );

		b_00_10 = _mm_mul_ss( b_00_10, a_00 );
		_mm_store_ss( &AA[lda*0], b_00_10 );

		AA += 1;
		}

	for(; k<kmax-3; k+=4)
		{

		AA += lda*(sda-1);
		
		b_00_10 = _mm_load_ps( &AA[0+lda*0] );

		b_00_10 = _mm_mul_ps( b_00_10, a_00 );
		_mm_store_ps( &AA[0+lda*0], b_00_10 );

		AA += 4;
		
		}

	AA += lda*(sda-1);

	for(; k<kmax; k++)
		{
		b_00_10 = _mm_load_ss( &AA[lda*0] );

		b_00_10 = _mm_mul_ss( b_00_10, a_00 );
		_mm_store_ss( &AA[lda*0], b_00_10 );

		AA += 1;
		}
	
	}
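A hedged scalar sketch (not part of the library) of the same 1x1 factorization step: store 1/sqrt(A[0][0]) in place (the "inverted diagonal"), then scale the kmax entries below it, traversing the 4-row panels exactly like the kernel does. Assumes <math.h> for sqrtf.
#include <math.h>

static void spotrf_strsv_1x1_ref(int kmax, float *A, int sda, int *info)
	{
	const int lda = 4;

	if( A[0] <= 0.0f ) { *info = 1; return; }
	float d = 1.0f / sqrtf( A[0] );
	A[0] = d; // inverted diagonal, stored in place

	if(kmax<=0)
		return;

	float *AA = A + 1;
	int k = 0;
	int kna = kmax<3 ? kmax : 3; // unaligned remainder of the first 4-row panel
	for(; k<kna; k++)
		{
		AA[0] *= d;
		AA += 1;
		}
	for(; k<kmax-3; k+=4)
		{
		AA += lda*(sda-1); // jump to the next 4-row panel
		AA[0] *= d; AA[1] *= d; AA[2] *= d; AA[3] *= d;
		AA += 4;
		}
	AA += lda*(sda-1);
	for(; k<kmax; k++)
		{
		AA[0] *= d;
		AA += 1;
		}
	}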
Example #10
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr2 must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr2 must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channels unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
Example #11
void kernel_spotrf_strsv_4x4_lib4(int kmax, int kinv, float *A, int sda, int *info)
	{
	
	const int lda = 4;
	

	__m128
		zeros, ones, ab_temp,
		a_00, a_10, a_20, a_30, a_11, a_21, a_31, a_22, a_32, a_33,
		b_00_10, b_01_11, b_02_12, b_03_13;
	
	zeros = _mm_set_ss( 0.0 );

	if(kinv==0)
		{
		
		a_00 = _mm_load_ss( &A[0+lda*0] );
		if( _mm_comile_ss ( a_00, zeros ) ) { *info = 1; return; }
		a_00 = _mm_sqrt_ss( a_00 );
		ones = _mm_set_ss( 1.0 );
		_mm_store_ss( &A[0+lda*0], a_00 );
		a_00 = _mm_div_ss( ones, a_00 );
		a_10 = _mm_load_ss( &A[1+lda*0] );
		a_20 = _mm_load_ss( &A[2+lda*0] );
		a_30 = _mm_load_ss( &A[3+lda*0] );
		a_10 = _mm_mul_ss( a_10, a_00 );
		a_20 = _mm_mul_ss( a_20, a_00 );
		a_30 = _mm_mul_ss( a_30, a_00 );
		_mm_store_ss( &A[1+lda*0], a_10 );
		_mm_store_ss( &A[2+lda*0], a_20 );
		_mm_store_ss( &A[3+lda*0], a_30 );
	
		a_11 = _mm_load_ss( &A[1+lda*1] );
		ab_temp = _mm_mul_ss( a_10, a_10 );
		a_11 = _mm_sub_ss( a_11, ab_temp );
		if( _mm_comile_ss ( a_11, zeros ) ) { *info = 1; return; }
		a_11 = _mm_sqrt_ss( a_11 );
		_mm_store_ss( &A[1+lda*1], a_11 );
		a_11 = _mm_div_ss( ones, a_11 );
		a_21 = _mm_load_ss( &A[2+lda*1] );
		a_31 = _mm_load_ss( &A[3+lda*1] );
		ab_temp = _mm_mul_ss( a_20, a_10 );
		a_21 = _mm_sub_ss( a_21, ab_temp );
		ab_temp = _mm_mul_ss( a_30, a_10 );
		a_31 = _mm_sub_ss( a_31, ab_temp );
		a_21 = _mm_mul_ss( a_21, a_11 );
		a_31 = _mm_mul_ss( a_31, a_11 );
		_mm_store_ss( &A[2+lda*1], a_21 );
		_mm_store_ss( &A[3+lda*1], a_31 );
	
		a_22 = _mm_load_ss( &A[2+lda*2] );
		ab_temp = _mm_mul_ss( a_20, a_20 );
		a_22 = _mm_sub_ss( a_22, ab_temp );
		ab_temp = _mm_mul_ss( a_21, a_21 );
		a_22 = _mm_sub_ss( a_22, ab_temp );
		if( _mm_comile_ss ( a_22, zeros ) ) { *info = 1; return; }
		a_22 = _mm_sqrt_ss( a_22 );
		_mm_store_ss( &A[2+lda*2], a_22 );
		a_22 = _mm_div_ss( ones, a_22 );
		a_32 = _mm_load_ss( &A[3+lda*2] );
		ab_temp = _mm_mul_ss( a_30, a_20 );
		a_32 = _mm_sub_ss( a_32, ab_temp );
		ab_temp = _mm_mul_ss( a_31, a_21 );
		a_32 = _mm_sub_ss( a_32, ab_temp );
		a_32 = _mm_mul_ss( a_32, a_22 );
		_mm_store_ss( &A[3+lda*2], a_32 );
			
		a_33 = _mm_load_ss( &A[3+lda*3] );
		ab_temp = _mm_mul_ss( a_30, a_30 );
		a_33 = _mm_sub_ss( a_33, ab_temp );
		ab_temp = _mm_mul_ss( a_31, a_31 );
		a_33 = _mm_sub_ss( a_33, ab_temp );
		ab_temp = _mm_mul_ss( a_32, a_32 );
		a_33 = _mm_sub_ss( a_33, ab_temp );
		if( _mm_comile_ss ( a_33, zeros ) ) { *info = 1; return; }
		a_33 = _mm_sqrt_ss( a_33 );
		_mm_store_ss( &A[3+lda*3], a_33 );
		if(kmax>0)
			a_33 = _mm_div_ss( ones, a_33 );

		}
	else // kinv == {1, 2, 3}
		{		

		a_00 = _mm_load_ss( &A[0+lda*0] );
		if( _mm_comile_ss ( a_00, zeros ) ) { *info = 1; return; }
		a_00 = _mm_sqrt_ss( a_00 );
		ones = _mm_set_ss( 1.0 );
		a_00 = _mm_div_ss( ones, a_00 );
		_mm_store_ss( &A[0+lda*0], a_00 );
		a_10 = _mm_load_ss( &A[1+lda*0] );
		a_20 = _mm_load_ss( &A[2+lda*0] );
		a_30 = _mm_load_ss( &A[3+lda*0] );
		a_10 = _mm_mul_ss( a_10, a_00 );
		a_20 = _mm_mul_ss( a_20, a_00 );
		a_30 = _mm_mul_ss( a_30, a_00 );
		_mm_store_ss( &A[1+lda*0], a_10 );
		_mm_store_ss( &A[2+lda*0], a_20 );
		_mm_store_ss( &A[3+lda*0], a_30 );
	
		a_11 = _mm_load_ss( &A[1+lda*1] );
		ab_temp = _mm_mul_ss( a_10, a_10 );
		a_11 = _mm_sub_ss( a_11, ab_temp );
		if( _mm_comile_ss ( a_11, zeros ) ) { *info = 1; return; }
		a_11 = _mm_sqrt_ss( a_11 );
		if(kinv<=1)
			{
			_mm_store_ss( &A[1+lda*1], a_11 );
			}
		a_11 = _mm_div_ss( ones, a_11 );
		if(kinv>1)
			_mm_store_ss( &A[1+lda*1], a_11 );
		a_21 = _mm_load_ss( &A[2+lda*1] );
		a_31 = _mm_load_ss( &A[3+lda*1] );
		ab_temp = _mm_mul_ss( a_20, a_10 );
		a_21 = _mm_sub_ss( a_21, ab_temp );
		ab_temp = _mm_mul_ss( a_30, a_10 );
		a_31 = _mm_sub_ss( a_31, ab_temp );
		a_21 = _mm_mul_ss( a_21, a_11 );
		a_31 = _mm_mul_ss( a_31, a_11 );
		_mm_store_ss( &A[2+lda*1], a_21 );
		_mm_store_ss( &A[3+lda*1], a_31 );
	
		a_22 = _mm_load_ss( &A[2+lda*2] );
		ab_temp = _mm_mul_ss( a_20, a_20 );
		a_22 = _mm_sub_ss( a_22, ab_temp );
		ab_temp = _mm_mul_ss( a_21, a_21 );
		a_22 = _mm_sub_ss( a_22, ab_temp );
		if( _mm_comile_ss ( a_22, zeros ) ) { *info = 1; return; }
		a_22 = _mm_sqrt_ss( a_22 );
		if(kinv<=2)
			{
			_mm_store_ss( &A[2+lda*2], a_22 );
			}
		a_22 = _mm_div_ss( ones, a_22 );
		if(kinv>2)
			_mm_store_ss( &A[2+lda*2], a_22 );
		a_32 = _mm_load_ss( &A[3+lda*2] );
		ab_temp = _mm_mul_ss( a_30, a_20 );
		a_32 = _mm_sub_ss( a_32, ab_temp );
		ab_temp = _mm_mul_ss( a_31, a_21 );
		a_32 = _mm_sub_ss( a_32, ab_temp );
		a_32 = _mm_mul_ss( a_32, a_22 );
		_mm_store_ss( &A[3+lda*2], a_32 );
		
		a_33 = _mm_load_ss( &A[3+lda*3] );
		ab_temp = _mm_mul_ss( a_30, a_30 );
		a_33 = _mm_sub_ss( a_33, ab_temp );
		ab_temp = _mm_mul_ss( a_31, a_31 );
		a_33 = _mm_sub_ss( a_33, ab_temp );
		ab_temp = _mm_mul_ss( a_32, a_32 );
		a_33 = _mm_sub_ss( a_33, ab_temp );
		if( _mm_comile_ss ( a_33, zeros ) ) { *info = 1; return; }
		a_33 = _mm_sqrt_ss( a_33 );
		_mm_store_ss( &A[3+lda*3], a_33 );
		if(kinv<=3)
			{
			_mm_store_ss( &A[3+lda*3], a_33 );
			}
		a_33 = _mm_div_ss( ones, a_33 );
		if(kinv>3)
			_mm_store_ss( &A[3+lda*3], a_33 );

		}

	
	if(kmax<=0)
		return;
	
	// strsv

/*	a_33 = _mm_div_ss( ones, a_33 );*/

	a_00 = _mm_shuffle_ps( a_00, a_00, 0 );
	a_10 = _mm_shuffle_ps( a_10, a_10, 0 );
	a_20 = _mm_shuffle_ps( a_20, a_20, 0 );
	a_30 = _mm_shuffle_ps( a_30, a_30, 0 );
	a_11 = _mm_shuffle_ps( a_11, a_11, 0 );
	a_21 = _mm_shuffle_ps( a_21, a_21, 0 );
	a_31 = _mm_shuffle_ps( a_31, a_31, 0 );
	a_22 = _mm_shuffle_ps( a_22, a_22, 0 );
	a_32 = _mm_shuffle_ps( a_32, a_32, 0 );
	a_33 = _mm_shuffle_ps( a_33, a_33, 0 );
	
	int k;
	
	float
		*AA;
	
	AA = A+4;
	k = 0;
	for(; k<kmax-3; k+=4)
		{

		AA += lda*(sda-1);
		
		b_00_10 = _mm_load_ps( &AA[0+lda*0] );
		b_01_11 = _mm_load_ps( &AA[0+lda*1] );
		b_02_12 = _mm_load_ps( &AA[0+lda*2] );
		b_03_13 = _mm_load_ps( &AA[0+lda*3] );

		b_00_10 = _mm_mul_ps( b_00_10, a_00 );
		_mm_store_ps( &AA[0+lda*0], b_00_10 );

		ab_temp = _mm_mul_ps( b_00_10, a_10 );
		b_01_11 = _mm_sub_ps( b_01_11, ab_temp );
		b_01_11 = _mm_mul_ps( b_01_11, a_11 );
		_mm_store_ps( &AA[0+lda*1], b_01_11 );

		ab_temp = _mm_mul_ps( b_00_10, a_20 );
		b_02_12 = _mm_sub_ps( b_02_12, ab_temp );
		ab_temp = _mm_mul_ps( b_01_11, a_21 );
		b_02_12 = _mm_sub_ps( b_02_12, ab_temp );
		b_02_12 = _mm_mul_ps( b_02_12, a_22 );
		_mm_store_ps( &AA[0+lda*2], b_02_12 );

		ab_temp = _mm_mul_ps( b_00_10, a_30 );
		b_03_13 = _mm_sub_ps( b_03_13, ab_temp );
		ab_temp = _mm_mul_ps( b_01_11, a_31 );
		b_03_13 = _mm_sub_ps( b_03_13, ab_temp );
		ab_temp = _mm_mul_ps( b_02_12, a_32 );
		b_03_13 = _mm_sub_ps( b_03_13, ab_temp );
		b_03_13 = _mm_mul_ps( b_03_13, a_33 );
		_mm_store_ps( &AA[0+lda*3], b_03_13 );

		AA += 4;
		
		}

	AA += lda*(sda-1);

	for(; k<kmax; k++)
		{
		b_00_10 = _mm_load_ss( &AA[lda*0] );
		b_01_11 = _mm_load_ss( &AA[lda*1] );
		b_02_12 = _mm_load_ss( &AA[lda*2] );
		b_03_13 = _mm_load_ss( &AA[lda*3] );

		b_00_10 = _mm_mul_ss( b_00_10, a_00 );
		_mm_store_ss( &AA[lda*0], b_00_10 );
	
		ab_temp = _mm_mul_ss( b_00_10, a_10 );
		b_01_11 = _mm_sub_ss( b_01_11, ab_temp );
		b_01_11 = _mm_mul_ss( b_01_11, a_11 );
		_mm_store_ss( &AA[lda*1], b_01_11 );

		ab_temp = _mm_mul_ss( b_00_10, a_20 );
		b_02_12 = _mm_sub_ss( b_02_12, ab_temp );
		ab_temp = _mm_mul_ss( b_01_11, a_21 );
		b_02_12 = _mm_sub_ss( b_02_12, ab_temp );
		b_02_12 = _mm_mul_ss( b_02_12, a_22 );
		_mm_store_ss( &AA[lda*2], b_02_12 );

		ab_temp = _mm_mul_ss( b_00_10, a_30 );
		b_03_13 = _mm_sub_ss( b_03_13, ab_temp );
		ab_temp = _mm_mul_ss( b_01_11, a_31 );
		b_03_13 = _mm_sub_ss( b_03_13, ab_temp );
		ab_temp = _mm_mul_ss( b_02_12, a_32 );
		b_03_13 = _mm_sub_ss( b_03_13, ab_temp );
		b_03_13 = _mm_mul_ss( b_03_13, a_33 );
		_mm_store_ss( &AA[lda*3], b_03_13 );

		AA += 1;
		}
	
	}
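A hedged scalar sketch (not part of the library) of the Cholesky recurrence the 4x4 branch above evaluates on its diagonal block, using the same column-major addressing A[i + 4*j]; when the inverted-diagonal variant is selected (kinv large enough), the reciprocal 1.0f/L[j][j] is what ends up stored on the diagonal. Assumes <math.h> for sqrtf.
#include <math.h>

static void spotrf_4x4_ref(float *A, int invert_diag, int *info)
	{
	const int lda = 4;
	for(int j=0; j<4; j++)
		{
		float d = A[j+lda*j];
		for(int k=0; k<j; k++)
			d -= A[j+lda*k] * A[j+lda*k]; // A[j][j] - sum_k L[j][k]^2
		if(d <= 0.0f) { *info = 1; return; } // not positive definite
		float l_jj = sqrtf(d);
		float inv  = 1.0f / l_jj;
		A[j+lda*j] = invert_diag ? inv : l_jj;
		for(int i=j+1; i<4; i++)
			{
			float s = A[i+lda*j];
			for(int k=0; k<j; k++)
				s -= A[i+lda*k] * A[j+lda*k]; // A[i][j] - sum_k L[i][k]*L[j][k]
			A[i+lda*j] = s * inv; // divide by L[j][j] via its stored reciprocal
			}
		}
	}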
Example #12
// inverted diagonal !!!
void kernel_spotrf_strsv_3x3_lib4(int kmax, float *A, int sda, int *info)
	{
	
	const int lda = 4;
	
	__m128
		zeros, ones, ab_temp,
		a_00, a_10, a_20, a_11, a_21, a_22,
		b_00_10, b_01_11, b_02_12;
	
	zeros = _mm_set_ss( 0.0 );

	a_00 = _mm_load_ss( &A[0+lda*0] );
	if( _mm_comile_ss ( a_00, zeros ) ) { *info = 1; return; }
	a_00 = _mm_sqrt_ss( a_00 );
	ones = _mm_set_ss( 1.0 );
	a_00 = _mm_div_ss( ones, a_00 );
	_mm_store_ss( &A[0+lda*0], a_00 );
	a_10 = _mm_load_ss( &A[1+lda*0] );
	a_20 = _mm_load_ss( &A[2+lda*0] );
	a_10 = _mm_mul_ss( a_10, a_00 );
	a_20 = _mm_mul_ss( a_20, a_00 );
	_mm_store_ss( &A[1+lda*0], a_10 );
	_mm_store_ss( &A[2+lda*0], a_20 );
	
	a_11 = _mm_load_ss( &A[1+lda*1] );
	ab_temp = _mm_mul_ss( a_10, a_10 );
	a_11 = _mm_sub_ss( a_11, ab_temp );
	if( _mm_comile_ss ( a_11, zeros ) ) { *info = 1; return; }
	a_11 = _mm_sqrt_ss( a_11 );
	a_11 = _mm_div_ss( ones, a_11 );
	_mm_store_ss( &A[1+lda*1], a_11 );
	a_21 = _mm_load_ss( &A[2+lda*1] );
	ab_temp = _mm_mul_ss( a_20, a_10 );
	a_21 = _mm_sub_ss( a_21, ab_temp );
	a_21 = _mm_mul_ss( a_21, a_11 );
	_mm_store_ss( &A[2+lda*1], a_21 );
	
	a_22 = _mm_load_ss( &A[2+lda*2] );
	ab_temp = _mm_mul_ss( a_20, a_20 );
	a_22 = _mm_sub_ss( a_22, ab_temp );
	ab_temp = _mm_mul_ss( a_21, a_21 );
	a_22 = _mm_sub_ss( a_22, ab_temp );
	if( _mm_comile_ss ( a_22, zeros ) ) { *info = 1; return; }
	a_22 = _mm_sqrt_ss( a_22 );
	a_22 = _mm_div_ss( ones, a_22 );
	_mm_store_ss( &A[2+lda*2], a_22 );

	
	if(kmax<=0)
		return;
	
	// strsv


	a_00 = _mm_shuffle_ps( a_00, a_00, 0 );
	a_10 = _mm_shuffle_ps( a_10, a_10, 0 );
	a_20 = _mm_shuffle_ps( a_20, a_20, 0 );
	a_11 = _mm_shuffle_ps( a_11, a_11, 0 );
	a_21 = _mm_shuffle_ps( a_21, a_21, 0 );
	a_22 = _mm_shuffle_ps( a_22, a_22, 0 );
	
	int k, kna;
	
	float
		*AA;
	
	AA = A + 3;
	k = 0;

	// clean up unaligned stuff at the beginning
	kna = 1;
	if(kmax<kna)
		kna = kmax;

	for(; k<kna; k++)
		{
		b_00_10 = _mm_load_ss( &AA[lda*0] );
		b_01_11 = _mm_load_ss( &AA[lda*1] );
		b_02_12 = _mm_load_ss( &AA[lda*2] );

		b_00_10 = _mm_mul_ss( b_00_10, a_00 );
		_mm_store_ss( &AA[lda*0], b_00_10 );
	
		ab_temp = _mm_mul_ss( b_00_10, a_10 );
		b_01_11 = _mm_sub_ss( b_01_11, ab_temp );
		b_01_11 = _mm_mul_ss( b_01_11, a_11 );
		_mm_store_ss( &AA[lda*1], b_01_11 );

		ab_temp = _mm_mul_ss( b_00_10, a_20 );
		b_02_12 = _mm_sub_ss( b_02_12, ab_temp );
		ab_temp = _mm_mul_ss( b_01_11, a_21 );
		b_02_12 = _mm_sub_ss( b_02_12, ab_temp );
		b_02_12 = _mm_mul_ss( b_02_12, a_22 );
		_mm_store_ss( &AA[lda*2], b_02_12 );

		AA += 1;
		}

	for(; k<kmax-3; k+=4)
		{

		AA += lda*(sda-1);
		
		b_00_10 = _mm_load_ps( &AA[0+lda*0] );
		b_01_11 = _mm_load_ps( &AA[0+lda*1] );
		b_02_12 = _mm_load_ps( &AA[0+lda*2] );

		b_00_10 = _mm_mul_ps( b_00_10, a_00 );
		_mm_store_ps( &AA[0+lda*0], b_00_10 );

		ab_temp = _mm_mul_ps( b_00_10, a_10 );
		b_01_11 = _mm_sub_ps( b_01_11, ab_temp );
		b_01_11 = _mm_mul_ps( b_01_11, a_11 );
		_mm_store_ps( &AA[0+lda*1], b_01_11 );

		ab_temp = _mm_mul_ps( b_00_10, a_20 );
		b_02_12 = _mm_sub_ps( b_02_12, ab_temp );
		ab_temp = _mm_mul_ps( b_01_11, a_21 );
		b_02_12 = _mm_sub_ps( b_02_12, ab_temp );
		b_02_12 = _mm_mul_ps( b_02_12, a_22 );
		_mm_store_ps( &AA[0+lda*2], b_02_12 );

		AA += 4;
		
		}

	AA += lda*(sda-1);

	for(; k<kmax; k++)
		{
		b_00_10 = _mm_load_ss( &AA[lda*0] );
		b_01_11 = _mm_load_ss( &AA[lda*1] );
		b_02_12 = _mm_load_ss( &AA[lda*2] );

		b_00_10 = _mm_mul_ss( b_00_10, a_00 );
		_mm_store_ss( &AA[lda*0], b_00_10 );
	
		ab_temp = _mm_mul_ss( b_00_10, a_10 );
		b_01_11 = _mm_sub_ss( b_01_11, ab_temp );
		b_01_11 = _mm_mul_ss( b_01_11, a_11 );
		_mm_store_ss( &AA[lda*1], b_01_11 );

		ab_temp = _mm_mul_ss( b_00_10, a_20 );
		b_02_12 = _mm_sub_ss( b_02_12, ab_temp );
		ab_temp = _mm_mul_ss( b_01_11, a_21 );
		b_02_12 = _mm_sub_ss( b_02_12, ab_temp );
		b_02_12 = _mm_mul_ss( b_02_12, a_22 );
		_mm_store_ss( &AA[lda*2], b_02_12 );

		AA += 1;
		}
	
	}
Example #13
void kernel_ssymv_4_lib8_old(int kmax, int kna, float *A, int sda, float *x_n, float *y_n, float *x_t, float *y_t, int tri, int alg)
	{
	
	if(kmax<=0) 
		return;
	
	const int lda = 8;
	
	int k;
	
	__m128
		zeros, temp,
		a_00, a_01, a_02, a_03,
		x_n_0, x_n_1, x_n_2, x_n_3, y_n_0,
		x_t_0, y_t_0, y_t_1, y_t_2, y_t_3;
	
	zeros = _mm_setzero_ps();

	x_n_0 = _mm_broadcast_ss( &x_n[0] );
	x_n_1 = _mm_broadcast_ss( &x_n[1] );
	x_n_2 = _mm_broadcast_ss( &x_n[2] );
	x_n_3 = _mm_broadcast_ss( &x_n[3] );

	if(alg==-1)
		{
		x_n_0 = _mm_sub_ps( zeros, x_n_0 );
		x_n_1 = _mm_sub_ps( zeros, x_n_1 );
		x_n_2 = _mm_sub_ps( zeros, x_n_2 );
		x_n_3 = _mm_sub_ps( zeros, x_n_3 );
		}

	y_t_0 = _mm_setzero_ps();
	y_t_1 = _mm_setzero_ps();
	y_t_2 = _mm_setzero_ps();
	y_t_3 = _mm_setzero_ps();
	
	k=0;

	// corner
	if(tri==1)
		{
		
		y_n_0 = _mm_load_ss( &y_n[0] );
		x_t_0 = _mm_load_ss( &x_t[0] );
		
		a_00  = _mm_load_ss( &A[0+lda*0] );
		a_01  = _mm_load_ss( &A[0+lda*1] );
		a_02  = _mm_load_ss( &A[0+lda*2] );
		a_03  = _mm_load_ss( &A[0+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_00, x_t_0 );
		y_t_0 = _mm_add_ss( y_t_0, temp );
		temp  = _mm_mul_ss( a_01, x_n_1 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[0], y_n_0 );


		y_n_0 = _mm_load_ss( &y_n[1] );
		x_t_0 = _mm_load_ss( &x_t[1] );
		
/*		a_00  = _mm_load_ss( &A[1+lda*0] );*/
		a_01  = _mm_load_ss( &A[1+lda*1] );
		a_02  = _mm_load_ss( &A[1+lda*2] );
		a_03  = _mm_load_ss( &A[1+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_00, x_t_0 );*/
/*		y_t_0 = _mm_add_ss( y_t_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_n_1 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[1], y_n_0 );


		y_n_0 = _mm_load_ss( &y_n[2] );
		x_t_0 = _mm_load_ss( &x_t[2] );
		
/*		a_00  = _mm_load_ss( &A[2+lda*0] );*/
/*		a_01  = _mm_load_ss( &A[2+lda*1] );*/
		a_02  = _mm_load_ss( &A[2+lda*2] );
		a_03  = _mm_load_ss( &A[2+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_00, x_t_0 );*/
/*		y_t_0 = _mm_add_ss( y_t_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_n_1 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_t_0 );*/
/*		y_t_1 = _mm_add_ss( y_t_1, temp );*/
/*		temp  = _mm_mul_ss( a_02, x_n_2 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[2], y_n_0 );

		
		y_n_0 = _mm_load_ss( &y_n[3] );
		x_t_0 = _mm_load_ss( &x_t[3] );
		
/*		a_00  = _mm_load_ss( &A[3+lda*0] );*/
/*		a_01  = _mm_load_ss( &A[3+lda*1] );*/
/*		a_02  = _mm_load_ss( &A[3+lda*2] );*/
		a_03  = _mm_load_ss( &A[3+lda*3] );
		
/*		temp  = _mm_mul_ss( a_00, x_n_0 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_00, x_t_0 );*/
/*		y_t_0 = _mm_add_ss( y_t_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_n_1 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_01, x_t_0 );*/
/*		y_t_1 = _mm_add_ss( y_t_1, temp );*/
/*		temp  = _mm_mul_ss( a_02, x_n_2 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
/*		temp  = _mm_mul_ss( a_02, x_t_0 );*/
/*		y_t_2 = _mm_add_ss( y_t_2, temp );*/
/*		temp  = _mm_mul_ss( a_03, x_n_3 );*/
/*		y_n_0 = _mm_add_ss( y_n_0, temp );*/
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[3], y_n_0 );
		

		A   += 4;
		y_n += 4;
		x_t += 4;

		k += 4;

		}
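	// peel the unaligned leading rows one at a time with scalar (_ss) arithmetic until k reaches kna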
	for(; k<kna; k++)
		{
		
		y_n_0 = _mm_load_ss( &y_n[0] );
		x_t_0 = _mm_load_ss( &x_t[0] );
		
		a_00  = _mm_load_ss( &A[0+lda*0] );
		a_01  = _mm_load_ss( &A[0+lda*1] );
		a_02  = _mm_load_ss( &A[0+lda*2] );
		a_03  = _mm_load_ss( &A[0+lda*3] );
		
		temp  = _mm_mul_ss( a_00, x_n_0 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_00, x_t_0 );
		y_t_0 = _mm_add_ss( y_t_0, temp );
		temp  = _mm_mul_ss( a_01, x_n_1 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[0], y_n_0 );

	
		A   += 1;
		y_n += 1;
		x_t += 1;
		
		}
	if(kna>0 || tri==1)
		{
		A += (sda-1)*lda;
		}
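	// main loop: one full 8-row panel per iteration; A is read with aligned
	// _mm_load_ps from the panel, while y_n and x_t use unaligned loads/stores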
	for(; k<kmax-7; k+=8)
		{
		
		y_n_0 = _mm_loadu_ps( &y_n[0] );
		x_t_0 = _mm_loadu_ps( &x_t[0] );
		
		a_00  = _mm_load_ps( &A[0+lda*0] );
		a_01  = _mm_load_ps( &A[0+lda*1] );
		a_02  = _mm_load_ps( &A[0+lda*2] );
		a_03  = _mm_load_ps( &A[0+lda*3] );
		
		temp  = _mm_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm_add_ps( y_t_0, temp );
		temp  = _mm_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm_add_ps( y_t_1, temp );
		temp  = _mm_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm_add_ps( y_t_2, temp );
		temp  = _mm_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm_add_ps( y_t_3, temp );
		
		_mm_storeu_ps( &y_n[0], y_n_0 );
		

		y_n_0 = _mm_loadu_ps( &y_n[4] );
		x_t_0 = _mm_loadu_ps( &x_t[4] );
		
		a_00  = _mm_load_ps( &A[4+lda*0] );
		a_01  = _mm_load_ps( &A[4+lda*1] );
		a_02  = _mm_load_ps( &A[4+lda*2] );
		a_03  = _mm_load_ps( &A[4+lda*3] );
		
		temp  = _mm_mul_ps( a_00, x_n_0 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_00, x_t_0 );
		y_t_0 = _mm_add_ps( y_t_0, temp );
		temp  = _mm_mul_ps( a_01, x_n_1 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_01, x_t_0 );
		y_t_1 = _mm_add_ps( y_t_1, temp );
		temp  = _mm_mul_ps( a_02, x_n_2 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_02, x_t_0 );
		y_t_2 = _mm_add_ps( y_t_2, temp );
		temp  = _mm_mul_ps( a_03, x_n_3 );
		y_n_0 = _mm_add_ps( y_n_0, temp );
		temp  = _mm_mul_ps( a_03, x_t_0 );
		y_t_3 = _mm_add_ps( y_t_3, temp );
		
		_mm_storeu_ps( &y_n[4], y_n_0 );
		

		A   += sda*lda;
		y_n += 8;
		x_t += 8;

		}
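	// cleanup: any remaining rows (k < kmax) are handled one at a time below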
	
	for(; k<kmax; k++)
		{
		
		y_n_0 = _mm_load_ss( &y_n[0] );
		x_t_0 = _mm_load_ss( &x_t[0] );
		
		a_00  = _mm_load_ss( &A[0+lda*0] );
		a_01  = _mm_load_ss( &A[0+lda*1] );
		a_02  = _mm_load_ss( &A[0+lda*2] );
		a_03  = _mm_load_ss( &A[0+lda*3] );
		
		temp  = _mm_mul_ss( a_00, x_n_0 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_00, x_t_0 );
		y_t_0 = _mm_add_ss( y_t_0, temp );
		temp  = _mm_mul_ss( a_01, x_n_1 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_01, x_t_0 );
		y_t_1 = _mm_add_ss( y_t_1, temp );
		temp  = _mm_mul_ss( a_02, x_n_2 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_02, x_t_0 );
		y_t_2 = _mm_add_ss( y_t_2, temp );
		temp  = _mm_mul_ss( a_03, x_n_3 );
		y_n_0 = _mm_add_ss( y_n_0, temp );
		temp  = _mm_mul_ss( a_03, x_t_0 );
		y_t_3 = _mm_add_ss( y_t_3, temp );
		
		_mm_store_ss( &y_n[0], y_n_0 );

	
		A   += 1;
		y_n += 1;
		x_t += 1;
		
		}

	// reduction: two rounds of horizontal adds collapse the four partial-sum
	// registers into one vector holding {sum(y_t_0), sum(y_t_1), sum(y_t_2), sum(y_t_3)}
	y_t_0 = _mm_hadd_ps(y_t_0, y_t_1);
	y_t_2 = _mm_hadd_ps(y_t_2, y_t_3);

	y_t_0 = _mm_hadd_ps(y_t_0, y_t_2);

	if(alg==1)
		{
		y_t_1 = _mm_loadu_ps( &y_t[0] );

		y_t_1 = _mm_add_ps(y_t_1, y_t_0);

		_mm_storeu_ps(&y_t[0], y_t_1);
		}
	else // alg==-1
		{
		y_t_1 = _mm_loadu_ps( &y_t[0] );

		y_t_1 = _mm_sub_ps(y_t_1, y_t_0);

		_mm_storeu_ps(&y_t[0], y_t_1);
		}
	
	}
Example #14
void sINLINE RNMarchingCubesBase<T>::func(const sVector31 &v,typename T::FieldType &pot,const funcinfo &fi)
{
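  // Reading of this routine (comment added, not part of the original source):
  // it evaluates a metaball-style field at point v by processing particles four
  // at a time (SoA layout in fi.parts4). Each particle inside the threshold
  // radius contributes max(1/d^2 - tresh, 0) to the potential, a weighted
  // direction term to the gradient/normal, and (if T::Color) a color sample.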
  __m128 vx = _mm_load_ps1(&v.x);
  __m128 vy = _mm_load_ps1(&v.y);
  __m128 vz = _mm_load_ps1(&v.z);
  __m128 po = _mm_setzero_ps();           // p
  __m128 nx = _mm_setzero_ps();
  __m128 ny = _mm_setzero_ps();
  __m128 nz = _mm_setzero_ps();
  __m128 akkur = _mm_setzero_ps();
  __m128 akkug = _mm_setzero_ps();
  __m128 akkub = _mm_setzero_ps();
  __m128 akkua = _mm_setzero_ps();
  __m128 s255 = _mm_set_ps1(255.0f);
  
  sBool good = 0;

  for(sInt i=0;i<fi.pn4;i++)
  {
    const typename T::SimdType *part = fi.parts4 + i; // 'typename' required for the dependent type outside MSVC

    __m128 dx = _mm_sub_ps(vx,part->x);
    __m128 dy = _mm_sub_ps(vy,part->y);
    __m128 dz = _mm_sub_ps(vz,part->z);
    __m128 ddx = _mm_mul_ps(dx,dx);
    __m128 ddy = _mm_mul_ps(dy,dy);
    __m128 ddz = _mm_mul_ps(dz,dz);
    __m128 pp = _mm_add_ps(_mm_add_ps(ddx,ddy),ddz);

    if(_mm_movemask_ps(_mm_cmple_ps(pp,fi.treshf4))!=0)
    {
      __m128 pp2 = _mm_sub_ps(_mm_div_ps(fi.one,pp),fi.tresh4);
      __m128 pp3 = _mm_max_ps(pp2,_mm_setzero_ps());
      po = _mm_add_ps(po,pp3);                  // p = p+pp;
      __m128 pp4 = _mm_mul_ps(pp3,pp3);         // pp*pp
      nx = _mm_add_ps(nx,_mm_mul_ps(pp4,dx));   // n += d*(pp*pp)
      ny = _mm_add_ps(ny,_mm_mul_ps(pp4,dy));
      nz = _mm_add_ps(nz,_mm_mul_ps(pp4,dz));
      if(T::Color==1)
      {
        akkur = _mm_add_ps(akkur,_mm_mul_ps(pp3,part->cr));
        akkug = _mm_add_ps(akkug,_mm_mul_ps(pp3,part->cg));
        akkub = _mm_add_ps(akkub,_mm_mul_ps(pp3,part->cb));
        good = 1;
      }
    }
  }

  sF32 p = 0;
  sVector30 n;
  
  _MM_TRANSPOSE4_PS(po,nx,ny,nz);
  __m128 r = _mm_add_ps(_mm_add_ps(_mm_add_ps(nx,ny),nz),po);
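  // note: the .m128_f32 / .m128i_u32 element accesses below are MSVC-specific
  // union members of __m128/__m128i and will not compile as-is under GCC/Clang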
  n.x = r.m128_f32[1];
  n.y = r.m128_f32[2];
  n.z = r.m128_f32[3];
  p = r.m128_f32[0];

  if(p==0)
    n.Init(0,0,0);
  else
    n.UnitFast();
  pot.x = n.x;
  pot.y = n.y;
  pot.z = n.z;
  pot.w = p-fi.iso;
  if(T::Color)
  {
    if(good)
    {
      r = _mm_mul_ss(s255,_mm_rcp_ss(r));
  //    r = _mm_rcp_ss(r);
      _MM_TRANSPOSE4_PS(akkub,akkug,akkur,akkua);
      __m128 r2 = _mm_add_ps(_mm_add_ps(_mm_add_ps(akkur,akkug),akkub),akkua);

      r2 = _mm_mul_ps(r2,_mm_shuffle_ps(r,r,0x00));
      __m128i r3 = _mm_cvtps_epi32(r2);
      r3 = _mm_packs_epi32(r3,r3);
      __m128i r4 = _mm_packus_epi16(r3,r3);
      pot.c = r4.m128i_u32[0]|0xff000000;
    }
    else
    {
      pot.c = 0;
    }
  }
}
Example #15
M_Matrix44
M_MatrixInvert44_SSE(M_Matrix44 A)
{
	M_Matrix44 Ainv;
	float *src = &A.m[0][0];
	float *dst = &Ainv.m[0][0];
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;
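
	/*
	 * This appears to follow the well-known Cramer's-rule 4x4 inverse from
	 * Intel's "Streaming SIMD Extensions - Inverse of 4x4 Matrix" note:
	 * cofactors are built four at a time, the determinant comes from
	 * row0 . minor0, and _mm_rcp_ss is refined with one Newton-Raphson step.
	 * tmp1/row1/row3 are used only as merge destinations of the first
	 * _mm_loadl_pi/_mm_loadh_pi pairs, so the "uninitialized variable"
	 * warnings some compilers emit here are benign in practice.
	 */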

	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)),
	                                          (__m64 *)(src+4));
	row1	= _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)),
	                                          (__m64 *)(src+12));
	row0	= _mm_shuffle_ps(tmp1, row1, 0x88);
	row1	= _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)),
	                                          (__m64 *)(src+6));
	row3	= _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)),
	                                          (__m64 *)(src+14));
	row2	= _mm_shuffle_ps(tmp1, row3, 0x88);
	row3	= _mm_shuffle_ps(row3, tmp1, 0xDD);

	tmp1	= _mm_mul_ps(row2, row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0	= _mm_mul_ps(row1, tmp1);
	minor1	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1	= _mm_shuffle_ps(minor1, minor1, 0x4E);

	tmp1	= _mm_mul_ps(row1, row2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3	= _mm_shuffle_ps(minor3, minor3, 0x4E);

	tmp1	= _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2	= _mm_shuffle_ps(row2, row2, 0x4E);
	minor0	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2	= _mm_shuffle_ps(minor2, minor2, 0x4E);

	tmp1	= _mm_mul_ps(row0, row1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3	= _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2	= _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

	tmp1	= _mm_mul_ps(row0, row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2	= _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

	tmp1	= _mm_mul_ps(row0, row2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

	det	= _mm_mul_ps(row0, minor0);
	det	= _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det	= _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1	= _mm_rcp_ss(det);
	det	= _mm_sub_ss(_mm_add_ss(tmp1, tmp1),
	                     _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1)));
	det	= _mm_shuffle_ps(det, det, 0x00);

	minor0	= _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64 *)(dst), minor0);
	_mm_storeh_pi((__m64 *)(dst+2), minor0);

	minor1	= _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64 *)(dst+4), minor1);
	_mm_storeh_pi((__m64 *)(dst+6), minor1);
	
	minor2	= _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64 *)(dst+8), minor2);
	_mm_storeh_pi((__m64 *)(dst+10), minor2);
	
	minor3	= _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64 *)(dst+12), minor3);
	_mm_storeh_pi((__m64 *)(dst+14), minor3);

	return (Ainv);
}