Пример #1
0
void
multab_suNC( GLU_complex a[ NCNC ] , 
	     const GLU_complex b[ NCNC ] , 
	     const GLU_complex c[ NCNC ] )
{
 // recast to alignment
  __m128d *A = (__m128d*)a ;
  const __m128d *B = (const __m128d*)b ;
  const __m128d *C = (const __m128d*)c ;
#if NC == 3
  // top row
  //a[0] = b[0] * c[0] + b[1] * c[3] + b[2] * c[6] ; 
  *( A + 0 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 3 ) ) ,
				       SSE2_MUL( *( B + 2 ) , *( C + 6 ) ) ) ) ;
  // a[1] = b[0] * c[1] + b[1] * c[4] + b[2] * c[7] ; 
  *( A + 1 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 1 ) ) ,
			   _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 4 ) ) ,
				       SSE2_MUL( *( B + 2 ) , *( C + 7 ) ) ) ) ;
  // a[2] = b[0] * c[2] + b[1] * c[5] + b[2] * c[8] ;
  *( A + 2 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 2 ) ) ,
			   _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 5 ) ) ,
				       SSE2_MUL( *( B + 2 ) , *( C + 8 ) ) ) ) ;
  // middle row //
  // a[3] = b[3] * c[0] + b[4] * c[3] + b[5] * c[6] ; 
  *( A + 3 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 0 ) ) ,
			   _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 3 ) ) ,
				       SSE2_MUL( *( B + 5 ) , *( C + 6 ) ) ) ) ;
  // a[4] = b[3] * c[1] + b[4] * c[4] + b[5] * c[7] ; 
  *( A + 4 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 1 ) ) ,
			   _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 4 ) ) ,
				       SSE2_MUL( *( B + 5 ) , *( C + 7 ) ) ) ) ;
  // a[5] = b[3] * c[2] + b[4] * c[5] + b[5] * c[8] ; 
  *( A + 5 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 2 ) ) ,
			   _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 5 ) ) ,
				       SSE2_MUL( *( B + 5 ) , *( C + 8 ) ) ) ) ;
  // bottom row // as a completion of the top two
  // a[6] = conj( a[1] * a[5] - a[2] * a[4] ) ; 
  *( A + 6 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 1 ) , *( A + 5 ) ) ,
				      SSE2_MUL( *( A + 2 ) , *( A + 4 ) ) ) ) ;
  // a[7] = conj( a[2] * a[3] - a[0] * a[5] ) ; 
  *( A + 7 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 2 ) , *( A + 3 ) ) ,
				      SSE2_MUL( *( A + 0 ) , *( A + 5 ) ) ) ) ;
  // a[8] = conj( a[0] * a[4] - a[1] * a[3] ) ; 
  *( A + 8 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 0 ) , *( A + 4 ) ) ,
				      SSE2_MUL( *( A + 1 ) , *( A + 3 ) ) ) ) ;
#elif NC == 2
  *( A + 0 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 0 ) ) ,
			   SSE2_MUL( *( B + 1 ) , *( C + 2 ) ) ) ;
  *( A + 1 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 1 ) ) ,
			   SSE2_MUL( *( B + 1 ) , *( C + 3 ) ) ) ;
  // a[2] = -conj( a[1] )
  *( A + 2 ) = SSE_FLIP( SSE2_CONJ( *( A + 1 ) ) ) ; 
  // a[3] =  conj( a[0] )
  *( A + 3 ) = SSE2_CONJ( *( A + 0 ) ) ;
#else
  return multab_suNC( a , b , c ) ;
#endif
  return ;
}
Пример #2
0
// rotate a matrix U = su2_i*U where su2_i is an su2 matrix embedded in suN
void
su2_rotate( GLU_complex U[ NCNC ] ,
	    const GLU_complex s0 ,
	    const GLU_complex s1 ,
	    const size_t su2_index )
{
#if NC == 3
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  register __m128d tmp0 , tmp1 , a , b ;
  switch( su2_index%3 ) { // again I don't like this
  case 0 :
    // first one
    a = *( u + 0 ) ; b = *( u + 3 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
		       SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
		       SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ;
    *( u + 3 ) = tmp1 ;
    // second one
    a = *( u + 1 ) ; b = *( u + 4 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
		       SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
		       SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ;
    *( u + 4 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 5 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
		       SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
		       SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ;
    *( u + 5 ) = tmp1 ;
    break ;
  case 1 :
    // first one
    a = *( u + 3 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
		       SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
		       SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 3 ) = tmp0 ;
    *( u + 6 ) = tmp1 ;
    // second one
    a = *( u + 4 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
		       SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
		       SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 4 ) = tmp0 ;
    *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 5 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) ,
		       SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) ,
		       SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 5 ) = tmp0 ;
    *( u + 8 ) = tmp1 ;
    break ;
  case 2 :
    // first one
    a = *( u + 0 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) ,
		       SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) ,
		       SSE2_MUL( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ;
    *( u + 6 ) = tmp1 ;
    // second
    a = *( u + 1 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) ,
		       SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) ,
		       SSE2_MUL( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ;
    *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) ,
		       SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) ,
		       SSE2_MUL( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ;
    *( u + 8 ) = tmp1 ;
    break ;
  }
#elif NC == 2
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  *( u + 0 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 0 ) ) ,
			   SSE2_MUL( sm1 , *( u + 2 ) ) ) ;
  *( u + 1 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 1 ) ) ,
			   SSE2_MUL( sm1 , *( u + 3 ) ) ) ;
  *( u + 2 ) = SSE_FLIP( SSE2_CONJ( *( u + 1 ) ) ) ; 
  *( u + 3 ) = SSE2_CONJ( *( u + 0 ) ) ;
#else
  // just a call to su2 multiply
  shortened_su2_multiply( U , s0 , s1 , su2_index ) ;
#endif
  return ;
}