Пример #1
0
// Project a 3D point given in the camera frame onto the image plane using
// the pinhole model.
//
// Returns (fx*x/z + cx, fy*y/z + cy) when z > 0 and the sentinel (-1,-1)
// otherwise.  Points that are already normalised (z == 1) skip the division.
Point2d CameraPinhole::Project(const Point3d& p3d)
{
    // Written as a negated test so that a NaN depth is rejected as well,
    // exactly like the final "else" of the previous if/else chain did.
    if(!(p3d.z>0)) return Point2d(-1,-1);

    if(p3d.z==1.)
    {
// __m128d and the _pd intrinsics belong to SSE2, so test for SSE2 rather
// than SSE: a build with only -msse defines __SSE__ but cannot compile them.
#ifdef __SSE2__
        const __m128d focal  = _mm_setr_pd(fx,fy);
        const __m128d centre = _mm_setr_pd(cx,cy);
        const __m128d uv = _mm_add_pd(centre,_mm_mul_pd(_mm_setr_pd(p3d.x,p3d.y),focal));

        // Copy the lanes out explicitly instead of reinterpreting the
        // register as a Point2d (no aliasing / layout assumption).
        double out[2];
        _mm_storeu_pd(out,uv);
        return Point2d(out[0],out[1]);
#else
        return Point2d(fx*p3d.x+cx,fy*p3d.y+cy);
#endif
    }

    // General case: perspective division, identical for both builds.
    const double z_inv=1./p3d.z;
    return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
}
Пример #2
0
// In-place left multiplication of w by an su(2) block that is stored
// compactly as the pair ( a , b ). Only the two rows selected by
// Latt.su2_data[ su2_index ] ( idx_a and idx_c ) are touched,
//
//     | a b || w[row_a] w[row_a+1] .... |
//     | c d || w[row_c] w[row_c+1] .... |
//
// the second row of the block is formed through SSE2_MULCONJ
// (macro defined elsewhere, presumably conj( first ) * second - confirm).
void
shortened_su2_multiply( GLU_complex *w , 
			const GLU_complex a , 
			const GLU_complex b , 
			const size_t su2_index )
{
  // pack both parameters as ( re , im ) pairs
  const __m128d A = _mm_setr_pd( creal( a ) , cimag( a ) ) ;
  const __m128d B = _mm_setr_pd( creal( b ) , cimag( b ) ) ;

  // offset of the first element of each of the two rows being mixed
  const size_t row_a = NC * (int)( Latt.su2_data[ su2_index ].idx_a / NC ) ;
  const size_t row_c = NC * (int)( Latt.su2_data[ su2_index ].idx_c / NC ) ;

  __m128d *upper = (__m128d*)( w + row_a ) ;
  __m128d *lower = (__m128d*)( w + row_c ) ;

  size_t col ;
  for( col = 0 ; col < NC ; col++ ) {
    // the lower-row update still needs the upper element from before the write
    const __m128d old_upper = upper[ col ] ;
    upper[ col ] = _mm_add_pd( SSE2_MUL( A , old_upper ) ,
			       SSE2_MUL( B , lower[ col ] ) ) ;
    lower[ col ] = _mm_sub_pd( SSE2_MULCONJ( A , lower[ col ] ) ,
			       SSE2_MULCONJ( B , old_upper ) ) ;
  }
}
Пример #3
0
// In-place right multiplication U <- U.su(2)^{\dagger} for an su(2) block
// stored compactly as the pair ( a , b ). Only the two columns selected by
// Latt.su2_data[ su2_index ] ( idx_a and idx_b ) are touched,
//
//   | U[col_a]    U[col_b]    | | a b |^{\dagger}
//   | U[col_a+NC] U[col_b+NC] | | c d |
//   | .....       .......     |
//
void
shortened_su2_multiply_dag( GLU_complex *U , 
			    const GLU_complex a , 
			    const GLU_complex b , 
			    const size_t su2_index )
{
  // A and B hold conj( a ) and conj( b ) as ( re , -im ) pairs
  const __m128d A = _mm_setr_pd( creal( a ) , -cimag( a ) ) ;
  const __m128d B = _mm_setr_pd( creal( b ) , -cimag( b ) ) ;

  // the two columns being mixed
  const size_t col_a = (int)( Latt.su2_data[ su2_index ].idx_a % NC ) ;
  const size_t col_b = (int)( Latt.su2_data[ su2_index ].idx_b % NC ) ;

  __m128d *left = (__m128d*)( U + col_a ) ;
  __m128d *right = (__m128d*)( U + col_b ) ;

  // walk down the rows, one row is NC elements further on
  size_t row ;
  for( row = 0 ; row < NC ; row++ , left += NC , right += NC ) {
    // the right-column update still needs the left element from before the write
    const __m128d old_left = *left ;
    *left = _mm_add_pd( SSE2_MUL( old_left , A ) ,
			SSE2_MUL( *right , B ) ) ;
    *right = _mm_sub_pd( SSE2_MUL_CONJ( *right , A ) ,
			 SSE2_MUL_CONJ( old_left , B ) ) ;
  }
}
Пример #4
0
Файл: main.cpp Проект: sclc/DPP
// Demonstrates _mm_testz_pd / _mm256_testz_pd: the intrinsic returns 1 when
// the sign bit of (a AND b) is clear in every lane and 0 otherwise.
// Prints one result per line for positive and negative operands,
// first for 128-bit and then for 256-bit vectors.
int
main(void)
{
    // 128-bit, all lanes positive
    const __m128d pos128 = _mm_setr_pd(1,2);
    printf("%d\n", _mm_testz_pd(pos128, pos128));

    // 128-bit, all lanes negative
    const __m128d neg128 = _mm_setr_pd(-1,-2);
    printf("%d\n", _mm_testz_pd(neg128, neg128));

    // 256-bit, all lanes positive
    const __m256d pos256 = _mm256_setr_pd(1,2,3,4);
    printf("%d\n", _mm256_testz_pd(pos256, pos256));

    // 256-bit, all lanes negative
    const __m256d neg256 = _mm256_setr_pd(-1,-2,-3,-4);
    printf("%d\n", _mm256_testz_pd(neg256, neg256));

    return 0;
}
Пример #5
0
Файл: main.cpp Проект: sclc/DPP
// Print `label` followed by `count` doubles read from `v`, each formatted
// as "%5.1f  ". No trailing newline is written.
static void
print_lanes(const char *label, const double *v, int count)
{
    printf("%s", label);
    for(int i=0; i<count; i++)
        printf("%5.1f  ", v[i]);
}

// Demonstrates _mm256_permute_pd and _mm_permute_pd by printing a vector
// before and after the permutation.
//
// The lanes are copied out with unaligned stores instead of being read
// through the .m256d_f64 / .m128d_f64 members, which only exist in MSVC's
// definition of the vector types.
int
main(void)
{
    double lanes[4];

    //_mm256_permute_pd
    __m256d da = _mm256_setr_pd(1,2,3,4);

    _mm256_storeu_pd(lanes, da);
    print_lanes("da: ", lanes, 4);
    printf("\n");

    __m256d dc = _mm256_permute_pd(da, 0x02);

    _mm256_storeu_pd(lanes, dc);
    print_lanes("dc: ", lanes, 4);
    printf("\n\n");


    //_mm_permute_pd
    __m128d fa = _mm_setr_pd(1, 2);

    _mm_storeu_pd(lanes, fa);
    print_lanes("fa: ", lanes, 2);
    printf("\n");

    __m128d fc = _mm_permute_pd(fa,0x01);

    _mm_storeu_pd(lanes, fc);
    print_lanes("fc: ", lanes, 2);
    printf("\n");

    return 0;
}
Пример #6
0
/*
 * Allocate and fill the SIMD coefficient tables on first use.
 *
 *   A_s : 12 x __m128  (4 floats per entry), built only when HAVE_SSE is set
 *   A_d : 24 x __m128d (2 doubles per entry), built only when HAVE_SSE2 is set
 *
 * Each A_s entry corresponds to two consecutive A_d entries holding the same
 * four numbers: A_d[2k] carries elements 2..3 and A_d[2k+1] elements 0..1
 * of A_s[k].
 *
 * A table whose pointer is already non-null is left untouched, so the
 * function may be called repeatedly.  The process is aborted if the aligned
 * allocation fails, rather than writing through an invalid pointer.
 */
void init_sse_data()
{
#ifdef HAVE_SSE
  if (A_s == 0) {
    if (posix_memalign ((void**)&A_s, 16, (sizeof(__m128)*12)) != 0)
      abort();
    A_s[0]  = _mm_setr_ps ( 1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
    A_s[1]  = _mm_setr_ps ( 4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );
    A_s[2]  = _mm_setr_ps ( 1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );
    A_s[3]  = _mm_setr_ps ( 0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );
    A_s[4]  = _mm_setr_ps ( -0.5,  1.0, -0.5, 0.0  );
    A_s[5]  = _mm_setr_ps (  0.0, -2.0,  1.5, 0.0  );
    A_s[6]  = _mm_setr_ps (  0.5,  1.0, -1.5, 0.0  );
    A_s[7]  = _mm_setr_ps (  0.0,  0.0,  0.5, 0.0  );
    A_s[8]  = _mm_setr_ps (  1.0, -1.0,  0.0, 0.0  );
    A_s[9]  = _mm_setr_ps ( -2.0,  3.0,  0.0, 0.0  );
    A_s[10] = _mm_setr_ps (  1.0, -3.0,  0.0, 0.0  );
    A_s[11] = _mm_setr_ps (  0.0,  1.0,  0.0, 0.0  );
  }

#endif
#ifdef HAVE_SSE2
  if (A_d == 0) {
    if (posix_memalign ((void**)&A_d, 16, (sizeof(__m128d)*24)) != 0)
      abort();
    A_d[ 0] = _mm_setr_pd (  3.0/6.0, -1.0/6.0 );
    A_d[ 1] = _mm_setr_pd (  1.0/6.0, -3.0/6.0 );
    A_d[ 2] = _mm_setr_pd ( -6.0/6.0,  3.0/6.0 );
    A_d[ 3] = _mm_setr_pd (  4.0/6.0,  0.0/6.0 );
    A_d[ 4] = _mm_setr_pd (  3.0/6.0, -3.0/6.0 );
    A_d[ 5] = _mm_setr_pd (  1.0/6.0,  3.0/6.0 );
    A_d[ 6] = _mm_setr_pd (  0.0/6.0,  1.0/6.0 );
    A_d[ 7] = _mm_setr_pd (  0.0/6.0,  0.0/6.0 );
    A_d[ 8] = _mm_setr_pd ( -0.5,  0.0 );
    A_d[ 9] = _mm_setr_pd ( -0.5,  1.0 );
    A_d[10] = _mm_setr_pd (  1.5,  0.0 );
    A_d[11] = _mm_setr_pd (  0.0, -2.0 );
    A_d[12] = _mm_setr_pd ( -1.5,  0.0 );
    A_d[13] = _mm_setr_pd (  0.5,  1.0 );
    A_d[14] = _mm_setr_pd (  0.5,  0.0 );
    A_d[15] = _mm_setr_pd (  0.0,  0.0 );
    A_d[16] = _mm_setr_pd (  0.0,  0.0 );
    A_d[17] = _mm_setr_pd (  1.0, -1.0 );
    A_d[18] = _mm_setr_pd (  0.0,  0.0 );
    A_d[19] = _mm_setr_pd ( -2.0,  3.0 );
    A_d[20] = _mm_setr_pd (  0.0,  0.0 );
    A_d[21] = _mm_setr_pd (  1.0, -3.0 );
    A_d[22] = _mm_setr_pd (  0.0,  0.0 );
    A_d[23] = _mm_setr_pd (  0.0,  1.0 );
  }
#endif
}
Пример #7
0
// Project a 3D point given in the camera frame onto the image plane.
//
// Points with z <= 0 yield the sentinel (-1,-1).  Otherwise the point is
// normalised to the z == 1 plane and, when useDistortion is set, the
// normalised radius r is rescaled by d_inv * atan(r * tan2w) / r (no
// rescaling when r < 0.001 or d == 0) before applying fx, fy, cx, cy.
// Without distortion this is the plain pinhole projection.
Point2d CameraATAN::Project(const Point3d& p3d)
{
    if(p3d.z<=0) return Point2d(-1,-1);

#ifdef __SSE3__
    if(useDistortion)
    {
        __m128d xy=_mm_setr_pd(p3d.x,p3d.y);
        if(p3d.z!=1.)
        {
            // Normalise by the depth.  This used to subtract z from x and y,
            // which disagreed with the scalar implementation below.
            xy=_mm_mul_pd(xy,_mm_set1_pd(1./p3d.z));
        }

        // r = sqrt(x^2 + y^2): the horizontal add puts the sum in both lanes
        __m128d xy2=_mm_mul_pd(xy,xy);
        xy2=_mm_hadd_pd(xy2,xy2);
        xy2=_mm_sqrt_pd(xy2);
        double r=_mm_cvtsd_f64(xy2);

        // from here on r is the radial scale factor, not the radius
        if(r < 0.001 || d == 0.0)
            r=1.0;
        else
            r=(d_inv* atan(r * tan2w) / r);

        xy=_mm_mul_pd(_mm_setr_pd(fx,fy),xy);
        xy=_mm_mul_pd(xy,_mm_set1_pd(r));
        xy=_mm_add_pd(xy,_mm_setr_pd(cx,cy));

        // copy the lanes out instead of reinterpreting the register as Point2d
        double out[2];
        _mm_storeu_pd(out,xy);
        return Point2d(out[0],out[1]);
    }
    else
    {
        if(p3d.z==1.)
        {
            __m128d xy = _mm_setr_pd(p3d.x,p3d.y);
            xy=_mm_add_pd(_mm_setr_pd(cx,cy),_mm_mul_pd(xy,_mm_setr_pd(fx,fy)));
            double out[2];
            _mm_storeu_pd(out,xy);
            return Point2d(out[0],out[1]);
        }
        else if(p3d.z>0) // only false for a NaN depth, which falls through to the sentinel
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#else
    if(useDistortion)
    {
        double X=p3d.x,Y=p3d.y;
        if(p3d.z!=1.)
        {
            double z_inv=1./p3d.z;
            X*=z_inv;Y*=z_inv;
        }
        double r= sqrt(X*X+Y*Y);
        // from here on r is the radial scale factor, not the radius
        if(r < 0.001 || d == 0.0)
            r= 1.0;
        else
            r=(d_inv* atan(r * tan2w) / r);

        return Point2d(cx + fx * r * X,cy + fy * r * Y);
    }
    else
    {
        if(p3d.z==1.)
        {
            return Point2d(fx*p3d.x+cx,fy*p3d.y+cy);
        }
        else
        {
            double z_inv=1./p3d.z;
            return Point2d(fx*z_inv*p3d.x+cx,fy*z_inv*p3d.y+cy);
        }
    }
#endif
    return Point2d(-1,-1);// keeps the compiler happy; also reached for a NaN depth in the SSE3 build
}
Пример #8
0
// rotate a matrix U = su2_i*U where su2_i is an su2 matrix embedded in suN
//
// s0 , s1 are the two parameters of the su2 matrix and su2_index selects
// which pair of rows of U is mixed. NC == 3 and NC == 2 are handled here
// directly, any other NC is forwarded to shortened_su2_multiply.
void
su2_rotate( GLU_complex U[ NCNC ] ,
	    const GLU_complex s0 ,
	    const GLU_complex s1 ,
	    const size_t su2_index )
{
#if NC == 3
  __m128d *u = (__m128d*)U ;
  const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  // subgroup 0 mixes rows ( 0 , 1 ) , 1 mixes ( 1 , 2 ) and 2 mixes ( 0 , 2 )
  const size_t subgroup = su2_index%3 ;
  const size_t top = ( subgroup == 1 ) ? 3 : 0 ; // first element of upper row
  const size_t bot = ( subgroup == 0 ) ? 3 : 6 ; // first element of lower row
  size_t col ;
  for( col = 0 ; col < 3 ; col++ ) {
    // read both elements before either is overwritten
    const __m128d a = u[ top + col ] ;
    const __m128d b = u[ bot + col ] ;
    if( subgroup == 2 ) {
      // the third subgroup uses the conjugated products on the upper row
      u[ top + col ] = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) ,
				   SSE2_MULCONJ( sm1 , b ) ) ;
      u[ bot + col ] = _mm_add_pd( SSE2_MUL( sm0 , b ) ,
				   SSE2_MUL( sm1 , a ) ) ;
    } else {
      u[ top + col ] = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
				   SSE2_MUL( sm1 , b ) ) ;
      u[ bot + col ] = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
				   SSE2_MULCONJ( sm1 , a ) ) ;
    }
  }
#elif NC == 2
  __m128d *u = (__m128d*)U ;
  const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  // compute the top row first ...
  u[ 0 ] = _mm_add_pd( SSE2_MUL( sm0 , u[ 0 ] ) ,
		       SSE2_MUL( sm1 , u[ 2 ] ) ) ;
  u[ 1 ] = _mm_add_pd( SSE2_MUL( sm0 , u[ 1 ] ) ,
		       SSE2_MUL( sm1 , u[ 3 ] ) ) ;
  // ... then rebuild the bottom row from the freshly written top row
  u[ 2 ] = SSE_FLIP( SSE2_CONJ( u[ 1 ] ) ) ; 
  u[ 3 ] = SSE2_CONJ( u[ 0 ] ) ;
#else
  // generic NC, just a call to su2 multiply
  shortened_su2_multiply( U , s0 , s1 , su2_index ) ;
#endif
}
Пример #9
0
 // Construct from two doubles: f0 goes to the low lane, f1 to the high lane.
 // (_mm_set_pd takes its arguments high-to-low, hence the swapped order.)
 inline vector2d(double f0, double f1) : m_value(_mm_set_pd(f1, f0)) {
 }
Пример #10
0
	// Pack four doubles into a Simd: reg[0] holds (x, y) and reg[1] holds
	// (z, w), each with the first value in the low lane.
	// (_mm_set_pd takes its arguments high-to-low, hence the swapped order.)
	static inline Simd set(double x, double y, double z, double w) {
		Simd packed;
		packed.reg[0] = _mm_set_pd(y, x);
		packed.reg[1] = _mm_set_pd(w, z);
		return packed;
	}
Пример #11
0
	// Pack two doubles into a Simd: x in the low lane, y in the high lane.
	// (_mm_set_pd takes its arguments high-to-low, hence the swapped order.)
	static inline Simd set(double x, double y) {
		Simd packed;
		packed.reg = _mm_set_pd(y, x);
		return packed;
	}
Пример #12
0
/*
 * Biquad IIR filter over two channels whose samples are interleaved in
 * src/dst; lane 0 of every SSE2 register carries the first channel and
 * lane 1 the second.  For each channel
 *
 *	y[k] = a0*x[k] + a1*x[k-1] + a2*x[k-2] + b1*y[k-1] + b2*y[k-2]
 *
 * dst    - output, 2*n values
 * src    - input, 2*n values
 * filter - mlib_IIR_filt_F32S holding the coefficients and the per-channel
 *          history (x10/x20/y10/y20 and x11/x21/y11/y21); the history is
 *          updated on return
 * n      - number of samples per channel
 *
 * Returns MLIB_NULLPOINTER if any pointer is NULL, MLIB_OUTOFRANGE if
 * n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_SignalIIR_Biquad_F32S_F32S(
    mlib_f32 *dst,
    const mlib_f32 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_s32 i;

	__m128d sa0, sa1, sa2, sb1, sb2;
	__m128d sx1, sx2, sy1, sy2, sr0, sx0;
	__m128d stmp1, stmp2, stmp3, stmp4, stmp5;
	mlib_d64 tr0[2], tx1[2], tx2[2], ty1[2], ty2[2];

	mlib_IIR_filt_F32S *pflt = (mlib_IIR_filt_F32S *) filter;

	if (filter == NULL || src == NULL || dst == NULL)
		return (MLIB_NULLPOINTER);

	if (n <= 0)
		return (MLIB_OUTOFRANGE);

	/* from here on n counts interleaved values, not samples per channel */
	n *= 2;

	sa0 = _mm_set1_pd(pflt->a0);
	sa1 = _mm_set1_pd(pflt->a1);
	sa2 = _mm_set1_pd(pflt->a2);
	sb1 = _mm_set1_pd(pflt->b1);
	sb2 = _mm_set1_pd(pflt->b2);

	sx1 = _mm_setr_pd(pflt->x10, pflt->x11);
	sx2 = _mm_setr_pd(pflt->x20, pflt->x21);
	sy1 = _mm_setr_pd(pflt->y10, pflt->y11);
	sy2 = _mm_setr_pd(pflt->y20, pflt->y21);

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	for (i = 0; i < n; i += 2) {
		/*
		 * Load the current pair at the top of the iteration.  It used
		 * to be fetched one iteration ahead, which read two values
		 * past the end of src on the final pass.
		 */
		sx0 = _mm_setr_pd(src[i], src[i + 1]);

		stmp1 = _mm_mul_pd(sa0, sx0);
		stmp2 = _mm_mul_pd(sa1, sx1);
		stmp3 = _mm_mul_pd(sa2, sx2);
		stmp4 = _mm_mul_pd(sb2, sy2);
		stmp5 = _mm_mul_pd(sb1, sy1);
		stmp1 = _mm_add_pd(stmp1, stmp2);
		stmp3 = _mm_add_pd(stmp3, stmp4);
		stmp1 = _mm_add_pd(stmp1, stmp3);
		sr0 = _mm_add_pd(stmp1, stmp5);

		/* shift the input and output history by one sample */
		sx2 = sx1;
		sx1 = sx0;
		sy2 = sy1;
		sy1 = sr0;

		_mm_storeu_pd(tr0, sr0);

		dst[i] = tr0[0];
		dst[i + 1] = tr0[1];
	}

	/* write the history back so the next call continues the stream */
	_mm_storeu_pd(tx1, sx1);
	_mm_storeu_pd(tx2, sx2);
	_mm_storeu_pd(ty1, sy1);
	_mm_storeu_pd(ty2, sy2);

	pflt->x10 = tx1[0];
	pflt->x11 = tx1[1];
	pflt->x20 = tx2[0];
	pflt->x21 = tx2[1];

	pflt->y10 = ty1[0];
	pflt->y11 = ty1[1];
	pflt->y20 = ty2[0];
	pflt->y21 = ty2[1];

	return (MLIB_SUCCESS);
}
Пример #13
0
/*
 * Single-channel biquad IIR filter
 *
 *	y[k] = a0*x[k] + a1*x[k-1] + a2*x[k-2] + b1*y[k-1] + b2*y[k-2]
 *
 * dst    - output, n values
 * src    - input, n values
 * filter - mlib_IIR_filt_F32 holding the coefficients and the history
 *          (x1, x2, y1, y2); the history is updated on return
 * n      - number of samples
 *
 * The first (up to) two samples and a possible odd last sample are
 * computed with scalar code; everything in between is computed two
 * samples at a time with SSE2.
 *
 * Returns MLIB_NULLPOINTER if any pointer is NULL, MLIB_OUTOFRANGE if
 * n <= 0, MLIB_SUCCESS otherwise.
 */
mlib_status
__mlib_SignalIIR_Biquad_F32_F32(
    mlib_f32 *dst,
    const mlib_f32 *src,
    void *filter,
    mlib_s32 n)
{
	mlib_d64 a0, a1, a2, b1, b2, x1, x2, y1, y2, r0, x0;

	mlib_s32 i;

	__m128d sa0, sa1, sa2, sb1, sb2, sx1, sx2, sy1, sy2, sr0, sx0;
	__m128d stmp1, stmp2, stmp3, stmp4, stmp5;
	mlib_d64 tr0[2];

	mlib_IIR_filt_F32 *pflt = (mlib_IIR_filt_F32 *) filter;

	if (filter == NULL || src == NULL || dst == NULL)
		return (MLIB_NULLPOINTER);

	if (n <= 0)
		return (MLIB_OUTOFRANGE);

	a0 = pflt->a0;
	a1 = pflt->a1;
	a2 = pflt->a2;

	b1 = pflt->b1;
	b2 = pflt->b2;

	x1 = pflt->x1;
	x2 = pflt->x2;
	y1 = pflt->y1;
	y2 = pflt->y2;

	/* scalar start-up: at most the first two samples */
	for (i = 0; (i < 2) && (i < n); i++) {
		x0 = src[i];

		r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;

		x2 = x1;
		x1 = x0;

		y2 = y1;
		y1 = r0;

		dst[i] = r0;
	}

	sa0 = _mm_set1_pd(a0);
	sa1 = _mm_set1_pd(a1);
	sa2 = _mm_set1_pd(a2);
	sb1 = _mm_set1_pd(b1);
	sb2 = _mm_set1_pd(b2);

	/*
	 * Two samples per iteration: lane 0 computes y[i] in full, lane 1
	 * computes y[i+1] without its b1*y[i] term (y[i] is not known yet),
	 * which is added afterwards in scalar code.
	 *
	 * The registers are rebuilt from the scalar history and from
	 * src[i], src[i+1] at the top of each iteration.  They used to be
	 * prepared one iteration ahead, which read up to two values past
	 * the end of src (before the loop when n <= 3 and on its last pass).
	 */
	for (; i < n - 1; i += 2) {
		const mlib_d64 in0 = src[i];
		const mlib_d64 in1 = src[i + 1];

		sx0 = _mm_setr_pd(in0, in1);
		sx1 = _mm_setr_pd(x1, in0);
		sx2 = _mm_setr_pd(x2, x1);
		sy1 = _mm_setr_pd(y1, 0);
		sy2 = _mm_setr_pd(y2, y1);

		stmp1 = _mm_mul_pd(sa0, sx0);
		stmp2 = _mm_mul_pd(sa1, sx1);
		stmp3 = _mm_mul_pd(sa2, sx2);
		stmp4 = _mm_mul_pd(sb2, sy2);
		stmp5 = _mm_mul_pd(sb1, sy1);
		stmp1 = _mm_add_pd(stmp1, stmp2);
		stmp3 = _mm_add_pd(stmp3, stmp4);
		stmp1 = _mm_add_pd(stmp1, stmp3);
		sr0 = _mm_add_pd(stmp1, stmp5);

		_mm_storeu_pd(tr0, sr0);
		/* complete y[i+1] with the feedback from y[i] */
		tr0[1] += (b1 * tr0[0]);

		/* history for the next pair */
		x2 = in0;
		x1 = in1;
		y2 = tr0[0];
		y1 = tr0[1];

		dst[i] = tr0[0];
		dst[i + 1] = tr0[1];
	}

	/* scalar tail: the odd last sample, if any */
	for (; i < n; i++) {
		x0 = src[i];

		r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;

		x2 = x1;
		x1 = x0;

		y2 = y1;
		y1 = r0;

		dst[i] = r0;
	}

	pflt->x1 = x1;
	pflt->x2 = x2;

	pflt->y1 = y1;
	pflt->y2 = y2;

	return (MLIB_SUCCESS);
}
Пример #14
0
mlib_status
__mlib_SignalIIR_Biquad_S16_S16_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
#ifndef MLIB_USE_FTOI_CLAMPING
	mlib_s32 d;
#endif /* MLIB_USE_FTOI_CLAMPING */
	mlib_d64 a0, a1, a2, b1, b2, x1, x2, y1, y2, r0, x0;

	mlib_s32 i, j, tmp;
	__m128d sa0, sa1, sa2, sb1, sb2, sx1, sx2, sy1, sy2, sr0, sx0;
	__m128d stmp1, stmp2, stmp3, stmp4, stmp5;
	mlib_d64 tr0[2], tx0[2], tx1[2], tx2[2], ty1[2], ty2[2];

	mlib_IIR_filt_S16 *pflt = (mlib_IIR_filt_S16 *) filter;

	if (filter == NULL || src == NULL || dst == NULL)
		return (MLIB_NULLPOINTER);

	if (n <= 0)
		return (MLIB_OUTOFRANGE);

	a0 = pflt->a0;
	a1 = pflt->a1;
	a2 = pflt->a2;

	b1 = pflt->b1;
	b2 = pflt->b2;

	x1 = pflt->x1;
	x2 = pflt->x2;
	y1 = pflt->y1;
	y2 = pflt->y2;

	for (j = 0; (j < n) &&(j < 2); j++) {
		x0 = src[j];

		r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;

		x2 = x1;
		x1 = x0;

		y2 = y1;
		y1 = r0;

#ifndef MLIB_USE_FTOI_CLAMPING

		if (r0 > MLIB_S16_MAX)
			d = MLIB_S16_MAX;
		else if (r0 < MLIB_S16_MIN)
			d = MLIB_S16_MIN;
		else
			d = (mlib_s16)r0;

		dst[j] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
		dst[j] = ((mlib_s16)r0);
#endif /* MLIB_USE_FTOI_CLAMPING */
	}

	sa0 = _mm_set1_pd(a0);
	sa1 = _mm_set1_pd(a1);
	sa2 = _mm_set1_pd(a2);
	sb1 = _mm_set1_pd(b1);
	sb2 = _mm_set1_pd(b2);

	sx0 = _mm_setr_pd(src[j], src[j + 1]);
	sx1 = _mm_setr_pd(x1, src[j]);
	sx2 = _mm_setr_pd(x2, x1);
	sy1 = _mm_setr_pd(y1, 0);
	sy2 = _mm_setr_pd(y2, y1);

	for (i = j; i < n - 1; i += 2) {
		stmp1 = _mm_mul_pd(sa0, sx0);
		stmp2 = _mm_mul_pd(sa1, sx1);
		stmp3 = _mm_mul_pd(sa2, sx2);
		stmp4 = _mm_mul_pd(sb1, sy1);
		stmp5 = _mm_mul_pd(sb2, sy2);

		stmp1 = _mm_add_pd(stmp1, stmp2);
		stmp3 = _mm_add_pd(stmp3, stmp4);
		stmp1 = _mm_add_pd(stmp1, stmp3);
		sr0 = _mm_add_pd(stmp1, stmp5);

		_mm_storeu_pd(tr0, sr0);
		tr0[1] += (b1 * tr0[0]);

		_mm_storeu_pd(tx0, sx0);

		sx2 = sx0;
		sx1 = _mm_setr_pd(tx0[1], src[i + 2]);
		sx0 = _mm_setr_pd(src[i + 2], src[i + 3]);
		sy1 = _mm_setr_pd(tr0[1], 0);
		sy2 = _mm_setr_pd(tr0[0], tr0[1]);

#ifndef MLIB_USE_FTOI_CLAMPING
		if (tr0[0] > MLIB_S16_MAX)
			d = MLIB_S16_MAX;
		else if (tr0[0] < MLIB_S16_MIN)
			d = MLIB_S16_MIN;
		else
			d = (mlib_s16)tr0[0];

		dst[i] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
		dst[i] = (mlib_s16)tr0[0];
#endif /* MLIB_USE_FTOI_CLAMPING */

#ifndef MLIB_USE_FTOI_CLAMPING
		if (tr0[1] > MLIB_S16_MAX)
			d = MLIB_S16_MAX;
		else if (tr0[1] < MLIB_S16_MIN)
			d = MLIB_S16_MIN;
		else
			d = (mlib_s16)tr0[1];

		dst[i + 1] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
		dst[i + 1] = (mlib_s16)tr0[1];
#endif /* MLIB_USE_FTOI_CLAMPING */
	}

		_mm_storeu_pd(tx1, sx1);
		x1 = tx1[0];
		_mm_storeu_pd(tx2, sx2);
		x2 = tx2[0];

		_mm_storeu_pd(ty1, sy1);
		y1 = ty1[0];
		_mm_storeu_pd(ty2, sy2);
		y2 = ty2[0];

	for (; (i < n); i++) {
		x0 = src[i];

		r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;

		x2 = x1;
		x1 = x0;

		y2 = y1;
		y1 = r0;

#ifndef MLIB_USE_FTOI_CLAMPING

		if (r0 > MLIB_S16_MAX)
			d = MLIB_S16_MAX;
		else if (r0 < MLIB_S16_MIN)
			d = MLIB_S16_MIN;
		else
			d = (mlib_s16)r0;

		dst[i] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
		dst[i] = ((mlib_s16)r0);
#endif /* MLIB_USE_FTOI_CLAMPING */
	}

	pflt->x1 = x1;
	pflt->x2 = x2;
	pflt->y1 = y1;
	pflt->y2 = y2;

	return (MLIB_SUCCESS);
}
Пример #15
0
// Noise function written with SSE and FMA4 intrinsics.
//
// EPoint          - point to evaluate; its X, Y and Z components are read
// noise_generator - kNoiseGen_Perlin delegates to SolidNoise();
//                   kNoiseGen_RangeCorrected rescales the lattice sum;
//                   any other value offsets the lattice sum by 0.5
//
// Returns the noise value clamped to [0, 1].
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator==kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables.  Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0.  Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
            if (sum < 0.0) sum = 0.0;
            if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */

    // x and y share one register; z sits in the low lane of a second one
    // (upper lane zero).
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    // blendv selects on the sign bit of its third operand: negative
    // coordinates use the value shifted down by (1 - EPSILON) before the
    // truncating conversion, non-negative ones are truncated as they are.
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // offsets of the point from its integer lattice point
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // lattice indices relative to NOISE_MIN*, reduced to 12 bits
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // hashes of the four (x, y) corners of the lattice cell
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // RTable entries for the eight cell corners ("i" = this lattice point,
    // "j" = the next one along that axis)
    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // each offset broadcast to both lanes ...
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    // ... and the same offsets measured from the next lattice point
    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // interpolation weights s = f*f*(3 - 2*f) (nmacc computes -(a*b) + c)
    // and t = 1 - s
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    // weight products, as the names say: (ty, sy), (tx*ty, tx*sy), (sx*ty, sx*sy)
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // (iy, jy): y offset for the two corners handled by each INCRSUMP2 call
    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    // NOTE(review): the variables declared on the next line are not used
    // directly in this function; presumably INCRSUMP2 (a macro defined
    // elsewhere) refers to them by name - confirm before renaming or
    // removing any of them.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    // Accumulate the weighted corner contributions into int_sum1, two
    // corners (y and y+1) per call.
    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    // add the two lanes together
    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if(noise_generator==kNoiseGen_RangeCorrected)
    {
        /* details of range here:
        Min, max: -1.05242, 0.988997
        Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828

        We want to change it to as close to [0,1] as possible.
        */
        // macc computes a*b + c, i.e. (sum + 1.05242) * 0.48985582
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    // clamp the low lane to [0, 1] and return it
    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
Пример #16
0
// Vector-valued noise function written with SSE and FMA4 intrinsics.
//
// result - receives the three output components; the first two are written
//          with one unaligned two-double store through *result, the third
//          through result[Z]
// EPoint - point to evaluate; its X, Y and Z components are read
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
                /* JB fix for the range problem */

    // x and y share one register; z sits in the low lane of a second one
    // (upper lane zero).
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    // blendv selects on the sign bit of its third operand: negative
    // coordinates use the value shifted down by (1 - EPSILON) before the
    // truncating conversion, non-negative ones are truncated as they are.
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // offsets of the point from its integer lattice point
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // lattice indices relative to NOISE_MIN*, reduced to 12 bits
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // hashes of the four (x, y) corners of the lattice cell
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // RTable entries for the eight cell corners ("i" = this lattice point,
    // "j" = the next one along that axis)
    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // each offset broadcast to both lanes ...
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    // ... and the same offsets measured from the next lattice point
    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // interpolation weights s = f*f*(3 - 2*f) (nmacc computes -(a*b) + c)
    // and t = 1 - s
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));

    __m128d mm_tz = _mm_sub_pd(one, mm_sz);

    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));

    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    // weight products, as the names say: (ty, sy), (tx*ty, tx*sy), (sx*ty, sx*sy)
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    // the four x/y weight products combined with the z weights tz and sz
    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    // NOTE(review): the variables declared on the next line are not used
    // directly in this function; presumably INCRSUMP2 (a macro defined
    // elsewhere) refers to them by name - confirm before renaming or
    // removing any of them.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    // First pass: one corner per call, its weight broadcast to both lanes,
    // using the table entries at mpN and mpN + 8; the results accumulate in
    // sum_X_Y (presumably the X component in lane 0 and Y in lane 1 -
    // depends on INCRSUMP2 and the RTable layout, confirm).
    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    // Second pass: two corners (y and y+1) per call, using the table
    // entries at mpN + 16; the results accumulate in sum__Z.
    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    // add the two lanes of the second-pass sum together
    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    // two doubles go to the start of result, one to result[Z]
    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
Пример #17
0
// Gauss-Jordan inversion of the NC x NC complex matrix M with row pivoting
//
// M_1 receives the inverse, M is left untouched (the elimination works on
// a local copy). No test for a singular matrix is made, the return value
// is always GLU_SUCCESS.
//
// NOTE(review): absfac, swap_rows, eliminate_column and SSE2_inverse are
// defined elsewhere; absfac is assumed to return a squared modulus so that
// conj( z ) / absfac( z ) is 1/z - confirm.
static int
gauss_jordan( GLU_complex M_1[ NCNC ] , 
	      const GLU_complex M[ NCNC ] )
{
  __m128d a[ NCNC ] GLUalign ; // temporary space to overwrite matrix
  register __m128d best , attempt , m1 , fac ;
  size_t i , j , piv ;

  // copy M into the work space as ( re , im ) pairs and set M_1 to the
  // identity, diagonal elements are those with i%(NC+1) == 0
  for( i = 0 ; i < NCNC ; i++ ) {
    a[ i ] = _mm_setr_pd( creal( M[i] ) , cimag( M[i] ) ) ;
    M_1[ i ] = ( i%(NC+1) ) ? 0.0 :1.0 ;
  }

  // set these pointers, pB will be the inverse
  __m128d *pB = (__m128d*)M_1 , *pA = (__m128d*)a ;
  
  // loop over diagonal of the square matrix M
  for( i = 0 ; i < NC-1 ; i++ ) {

    // column pivot by selecting the largest in magnitude value of column i
    // on or below the diagonal. The search has to look at the matrix being
    // reduced ( pA ); it used to read pB, which at that point still holds
    // zeros below the diagonal, so a pivot was not chosen from M's entries
    piv = i ;
    best = absfac( *( pA + i*(NC+1) ) ) ;
    for( j = i+1 ; j < NC ; j++ ) {
      attempt = absfac( *( pA + i + j*NC ) ) ;
      if( _mm_ucomilt_sd( best , attempt ) ) { 
	piv = j ; 
	best = attempt ; 
      }
    }

    // if we must pivot then we do
    if( piv != i ) {
      swap_rows( pA , pB , piv , i ) ;
    }
  
    // perform gaussian elimination to obtain the upper triangular,
    // fac is conj( pivot ) / best
    fac = _mm_div_pd( SSE2_CONJ( *( pA + i*(NC+1) ) ) , best ) ;
    for( j = NC-1 ; j > i ; j-- ) { // go up in other columns
      eliminate_column( pA , pB , fac , i , j ) ;
    }
  }

  // a is upper triangular, do the same for the upper half
  // no pivoting to be done here
  for( i = NC-1 ; i > 0 ; i-- ) {
    fac = SSE2_inverse( *( pA + i*(NC+1) ) ) ;
    for( j = 0 ; j < i ; j++ ) {
      eliminate_column( pA , pB , fac , i , j ) ;
    }
  }

  // multiply each row of M_1 by the inverse of the matching diagonal
  // element of a, pA steps along the diagonal and pB through every element
  for( j = 0 ; j < NC ; j++ ) {
    m1 = SSE2_inverse( *pA ) ;
    for( i = 0 ; i < NC ; i++ ) {
      *pB = SSE2_MUL( *pB , m1 ) ;
      pB++ ;
    }
    pA += NC+1 ;
  }

  return GLU_SUCCESS ;
}