// compactified M.su(2)^{\dagger} multiply of,
//
//  | M[col(a)]    M[col(b)]    |  | a b |^{\dagger}
//  | M[col(a)+NC] M[col(b)+NC] |  | c d |
//  | .....        .......      |
//
void
shortened_su2_multiply_dag( GLU_complex *U ,
                            const GLU_complex a ,
                            const GLU_complex b ,
                            const size_t su2_index )
{
  // A and B are set to the conjugates of a and b
  register const __m128d A = _mm_setr_pd( creal( a ) , -cimag( a ) ) ;
  register const __m128d B = _mm_setr_pd( creal( b ) , -cimag( b ) ) ;
  // columns of U touched by this su(2) subgroup
  const size_t col_a = (int)( Latt.su2_data[ su2_index ].idx_a % NC ) ;
  const size_t col_b = (int)( Latt.su2_data[ su2_index ].idx_b % NC ) ;
  register __m128d tmp ;
  __m128d *U1 = (__m128d*)( U + col_a ) ;
  __m128d *U2 = (__m128d*)( U + col_b ) ;
  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    tmp = *U1 ;
    *U1 = _mm_add_pd( SSE2_MUL( tmp , A ) ,
                      SSE2_MUL( *U2 , B ) ) ;
    *U2 = _mm_sub_pd( SSE2_MUL_CONJ( *U2 , A ) ,
                      SSE2_MUL_CONJ( tmp , B ) ) ;
    U1 += NC ;
    U2 += NC ;
  }
  return ;
}
// compactified (sparse matrix rep) su(2) multiply of,
//
//  | a b |  | M[row(a)] M[row(a)+1] .... |
//  | c d |  | M[row(c)] M[row(c)+1] .... |
//
void
shortened_su2_multiply( GLU_complex *w ,
                        const GLU_complex a ,
                        const GLU_complex b ,
                        const size_t su2_index )
{
  register const __m128d A = _mm_setr_pd( creal( a ) , cimag( a ) ) ;
  register const __m128d B = _mm_setr_pd( creal( b ) , cimag( b ) ) ;
  // rows of w touched by this su(2) subgroup
  const size_t row_a = NC * (int)( Latt.su2_data[ su2_index ].idx_a / NC ) ;
  const size_t row_c = NC * (int)( Latt.su2_data[ su2_index ].idx_c / NC ) ;
  register __m128d tmp ;
  __m128d *w1 = (__m128d*)( w + row_a ) ;
  __m128d *w2 = (__m128d*)( w + row_c ) ;
  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    tmp = *w1 ;
    *w1 = _mm_add_pd( SSE2_MUL( A , *w1 ) ,
                      SSE2_MUL( B , *w2 ) ) ;
    *w2 = _mm_sub_pd( SSE2_MULCONJ( A , *w2 ) ,
                      SSE2_MULCONJ( B , tmp ) ) ;
    w1++ , w2++ ;
  }
  return ;
}
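// For reference, a scalar sketch (illustration only, not part of the SSE
// path) of what shortened_su2_multiply does : the embedded su(2) block
//
//  |  a           b         |
//  | -conj( b )   conj( a ) |
//
// acts from the left on the two rows of w picked out by idx_a and idx_c,
// while shortened_su2_multiply_dag applies the daggered block from the
// right to the corresponding two columns.
#if 0
static void
shortened_su2_multiply_scalar( GLU_complex *w ,
                               const GLU_complex a ,
                               const GLU_complex b ,
                               const size_t su2_index )
{
  const size_t row_a = NC * ( Latt.su2_data[ su2_index ].idx_a / NC ) ;
  const size_t row_c = NC * ( Latt.su2_data[ su2_index ].idx_c / NC ) ;
  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    const GLU_complex w1 = w[ row_a + i ] , w2 = w[ row_c + i ] ;
    w[ row_a + i ] = a * w1 + b * w2 ;
    w[ row_c + i ] = conj( a ) * w2 - conj( b ) * w1 ;
  }
  return ;
}
#endif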
// column elimination
static void
eliminate_column( __m128d *a ,
                  __m128d *inverse ,
                  const __m128d fac ,
                  const size_t i ,
                  const size_t j )
{
  // common factor crops up everywhere
  register const __m128d fac1 = SSE2_MUL( a[ i + j*NC ] , fac ) ;
  __m128d *pA = ( a + i*(NC+1) + 1 ) , *ppA = ( a + i + 1 + j*NC ) ;
  size_t k ;
  // starting at column i+1 skips the entries that are already zero
  for( k = i + 1 ; k < NC ; k++ ) {
    *ppA = _mm_sub_pd( *ppA , SSE2_MUL( fac1 , *pA ) ) ;
    pA++ , ppA++ ;
  }
  // whatever we do to a, we do to the identity
  pA  = ( inverse + i*NC ) ;
  ppA = ( inverse + j*NC ) ;
  for( k = 0 ; k < NC ; k++ ) {
    *ppA = _mm_sub_pd( *ppA , SSE2_MUL( fac1 , *pA ) ) ;
    pA++ , ppA++ ;
  }
  return ;
}
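// The step above is the usual Gauss-Jordan row operation
//
//   row_j <- row_j - ( a_{ji} / a_{ii} ) * row_i ,
//
// applied both to the working copy "a" and to "inverse". The caller passes
// fac = 1 / a_{ii} ( either as conj( a_{ii} ) / |a_{ii}|^2 or via
// SSE2_inverse , assuming absfac packs |z|^2 into both lanes ) , so
// fac1 = a_{ji} * fac is the complex multiplier for row j.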
// compute the real part of the trace of the product a.b.c^{\dagger} of
// three SU(NC) matrices
double
Re_trace_abc_dag_suNC( const GLU_complex a[ NCNC ] ,
                       const GLU_complex b[ NCNC ] ,
                       const GLU_complex c[ NCNC ] )
{
  const __m128d *A = ( const __m128d* )a ;
  const __m128d *B = ( const __m128d* )b ;
  const __m128d *C = ( const __m128d* )c ;
  double complex csum ;
#if NC == 3
  // first row of a.b
  //const GLU_complex a0 = ( a[0] * b[0] + a[1] * b[3] + a[2] * b[6] ) ;
  const __m128d a0 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 0 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 1 ) , *( B + 3 ) ) ,
                                             SSE2_MUL( *( A + 2 ) , *( B + 6 ) ) ) ) ;
  const __m128d a1 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 1 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 1 ) , *( B + 4 ) ) ,
                                             SSE2_MUL( *( A + 2 ) , *( B + 7 ) ) ) ) ;
  const __m128d a2 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 2 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 1 ) , *( B + 5 ) ) ,
                                             SSE2_MUL( *( A + 2 ) , *( B + 8 ) ) ) ) ;
  // second row
  const __m128d a3 = _mm_add_pd( SSE2_MUL( *( A + 3 ) , *( B + 0 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 4 ) , *( B + 3 ) ) ,
                                             SSE2_MUL( *( A + 5 ) , *( B + 6 ) ) ) ) ;
  const __m128d a4 = _mm_add_pd( SSE2_MUL( *( A + 3 ) , *( B + 1 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 4 ) , *( B + 4 ) ) ,
                                             SSE2_MUL( *( A + 5 ) , *( B + 7 ) ) ) ) ;
  const __m128d a5 = _mm_add_pd( SSE2_MUL( *( A + 3 ) , *( B + 2 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 4 ) , *( B + 5 ) ) ,
                                             SSE2_MUL( *( A + 5 ) , *( B + 8 ) ) ) ) ;
  // last row is always a completion
  const __m128d a6 = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( a1 , a5 ) , SSE2_MUL( a2 , a4 ) ) ) ;
  const __m128d a7 = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( a2 , a3 ) , SSE2_MUL( a0 , a5 ) ) ) ;
  const __m128d a8 = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( a0 , a4 ) , SSE2_MUL( a1 , a3 ) ) ) ;
  // and compute the real part of the trace
  register __m128d sum ;
  sum = SSE2_FMA( a0 , *( C + 0 ) , SSE2_FMA( a1 , *( C + 1 ) , _mm_mul_pd( a2 , *( C + 2 ) ) ) ) ;
  sum = SSE2_FMA( a3 , *( C + 3 ) , SSE2_FMA( a4 , *( C + 4 ) , sum ) ) ;
  sum = SSE2_FMA( a5 , *( C + 5 ) , SSE2_FMA( a6 , *( C + 6 ) , sum ) ) ;
  sum = SSE2_FMA( a7 , *( C + 7 ) , SSE2_FMA( a8 , *( C + 8 ) , sum ) ) ;
  _mm_store_pd( (void*)&csum , sum ) ;
#elif NC == 2
  // puts the four parts of the sum into the upper and lower parts of
  // two SSE-packed double words
  register const __m128d a0 = _mm_sub_pd( SSE2_MUL( *( A + 0 ) , *( B + 0 ) ) ,
                                          SSE2_MUL_CONJ( *( A + 1 ) , *( B + 1 ) ) ) ;
  register const __m128d a1 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 1 ) ) ,
                                          SSE2_MUL_CONJ( *( A + 1 ) , *( B + 0 ) ) ) ;
  register const __m128d sum = _mm_add_pd( _mm_mul_pd( a0 , *( C + 0 ) ) ,
                                           _mm_mul_pd( a1 , *( C + 1 ) ) ) ;
  // multiply sum by 2 , the second row contributes the complex conjugate
  _mm_store_pd( (void*)&csum , _mm_add_pd( sum , sum ) ) ;
#else
  register __m128d sum = _mm_setzero_pd( ) , insum ;
  size_t i , j , k ;
  for( i = 0 ; i < NC ; i++ ) {
    A = (const __m128d*)a ;
    for( j = 0 ; j < NC ; j++ ) {
      B = (const __m128d*)b ;
      insum = _mm_setzero_pd( ) ;
      for( k = 0 ; k < NC ; k++ ) {
        insum = _mm_add_pd( insum , SSE2_MUL( A[k] , B[i] ) ) ;
        B += NC ;
      }
      sum = _mm_add_pd( sum , _mm_mul_pd( insum , C[i+j*NC] ) ) ;
      // increment our pointers
      A += NC ;
    }
  }
  _mm_store_pd( (void*)&csum , sum ) ;
#endif
  return creal( csum ) + cimag( csum ) ;
}
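// Note on the contraction with C used throughout the routine above : with a
// complex number packed as the SSE pair ( creal(z) , cimag(z) ) , the
// element-wise product _mm_mul_pd( x , y ) ( or SSE2_FMA ) holds
// ( Re(x)Re(y) , Im(x)Im(y) ) , and summing the two lanes gives
//
//   Re( x * conj( y ) ) = creal( x ) * creal( y ) + cimag( x ) * cimag( y ) ,
//
// which is why the result is recovered as creal( csum ) + cimag( csum ) :
// the contraction is effectively with c^{\dagger} even though no explicit
// conjugate of c is ever taken.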
// Trace of the product of three matrices
void
trace_abc( GLU_complex *__restrict tr ,
           const GLU_complex a[ NCNC ] ,
           const GLU_complex b[ NCNC ] ,
           const GLU_complex c[ NCNC ] )
{
  const __m128d *A = ( const __m128d* )a ;
  const __m128d *B = ( const __m128d* )b ;
  const __m128d *C = ( const __m128d* )c ;
  double complex csum ;
#if NC == 3
  // first row of a.b
  const __m128d a0 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 0 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 1 ) , *( B + 3 ) ) ,
                                             SSE2_MUL( *( A + 2 ) , *( B + 6 ) ) ) ) ;
  const __m128d a1 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 1 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 1 ) , *( B + 4 ) ) ,
                                             SSE2_MUL( *( A + 2 ) , *( B + 7 ) ) ) ) ;
  const __m128d a2 = _mm_add_pd( SSE2_MUL( *( A + 0 ) , *( B + 2 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 1 ) , *( B + 5 ) ) ,
                                             SSE2_MUL( *( A + 2 ) , *( B + 8 ) ) ) ) ;
  // second row
  const __m128d a3 = _mm_add_pd( SSE2_MUL( *( A + 3 ) , *( B + 0 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 4 ) , *( B + 3 ) ) ,
                                             SSE2_MUL( *( A + 5 ) , *( B + 6 ) ) ) ) ;
  const __m128d a4 = _mm_add_pd( SSE2_MUL( *( A + 3 ) , *( B + 1 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 4 ) , *( B + 4 ) ) ,
                                             SSE2_MUL( *( A + 5 ) , *( B + 7 ) ) ) ) ;
  const __m128d a5 = _mm_add_pd( SSE2_MUL( *( A + 3 ) , *( B + 2 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 4 ) , *( B + 5 ) ) ,
                                             SSE2_MUL( *( A + 5 ) , *( B + 8 ) ) ) ) ;
  // third row of a.b ( completion sketch in the same pattern as
  // Re_trace_abc_dag_suNC above , computed directly since a and b need not
  // be special unitary here )
  const __m128d a6 = _mm_add_pd( SSE2_MUL( *( A + 6 ) , *( B + 0 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 7 ) , *( B + 3 ) ) ,
                                             SSE2_MUL( *( A + 8 ) , *( B + 6 ) ) ) ) ;
  const __m128d a7 = _mm_add_pd( SSE2_MUL( *( A + 6 ) , *( B + 1 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 7 ) , *( B + 4 ) ) ,
                                             SSE2_MUL( *( A + 8 ) , *( B + 7 ) ) ) ) ;
  const __m128d a8 = _mm_add_pd( SSE2_MUL( *( A + 6 ) , *( B + 2 ) ) ,
                                 _mm_add_pd( SSE2_MUL( *( A + 7 ) , *( B + 5 ) ) ,
                                             SSE2_MUL( *( A + 8 ) , *( B + 8 ) ) ) ) ;
  // contract with c : Tr( a.b.c ) = sum_{i,k} (a.b)_{ik} c_{ki}
  register __m128d sum ;
  sum = _mm_add_pd( SSE2_MUL( a0 , *( C + 0 ) ) ,
                    _mm_add_pd( SSE2_MUL( a1 , *( C + 3 ) ) , SSE2_MUL( a2 , *( C + 6 ) ) ) ) ;
  sum = _mm_add_pd( sum , _mm_add_pd( SSE2_MUL( a3 , *( C + 1 ) ) ,
                                      _mm_add_pd( SSE2_MUL( a4 , *( C + 4 ) ) ,
                                                  SSE2_MUL( a5 , *( C + 7 ) ) ) ) ) ;
  sum = _mm_add_pd( sum , _mm_add_pd( SSE2_MUL( a6 , *( C + 2 ) ) ,
                                      _mm_add_pd( SSE2_MUL( a7 , *( C + 5 ) ) ,
                                                  SSE2_MUL( a8 , *( C + 8 ) ) ) ) ) ;
  _mm_store_pd( (void*)&csum , sum ) ;
#else
  // generic fallback sketch : Tr( a.b.c ) = sum_{i,j,k} a_{ik} b_{kj} c_{ji}
  // with row-major storage and full complex products
  register __m128d sum = _mm_setzero_pd( ) , insum ;
  size_t i , j , k ;
  for( i = 0 ; i < NC ; i++ ) {
    for( j = 0 ; j < NC ; j++ ) {
      insum = _mm_setzero_pd( ) ;
      for( k = 0 ; k < NC ; k++ ) {
        insum = _mm_add_pd( insum , SSE2_MUL( A[ k + i*NC ] , B[ j + k*NC ] ) ) ;
      }
      sum = _mm_add_pd( sum , SSE2_MUL( insum , C[ i + j*NC ] ) ) ;
    }
  }
  _mm_store_pd( (void*)&csum , sum ) ;
#endif
  *tr = csum ;
  return ;
}
// rotate a matrix U = su2_i*U where su2_i is an su2 matrix embedded in suN
void
su2_rotate( GLU_complex U[ NCNC ] ,
            const GLU_complex s0 ,
            const GLU_complex s1 ,
            const size_t su2_index )
{
#if NC == 3
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  register __m128d tmp0 , tmp1 , a , b ;
  switch( su2_index%3 ) { // again I don't like this
  case 0 :
    // first one
    a = *( u + 0 ) ; b = *( u + 3 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ; *( u + 3 ) = tmp1 ;
    // second one
    a = *( u + 1 ) ; b = *( u + 4 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ; *( u + 4 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 5 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ; *( u + 5 ) = tmp1 ;
    break ;
  case 1 :
    // first one
    a = *( u + 3 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 3 ) = tmp0 ; *( u + 6 ) = tmp1 ;
    // second one
    a = *( u + 4 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 4 ) = tmp0 ; *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 5 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 5 ) = tmp0 ; *( u + 8 ) = tmp1 ;
    break ;
  case 2 :
    // first one
    a = *( u + 0 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ; *( u + 6 ) = tmp1 ;
    // second
    a = *( u + 1 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ; *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ; *( u + 8 ) = tmp1 ;
    break ;
  }
#elif NC == 2
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  *( u + 0 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 0 ) ) ,
                           SSE2_MUL( sm1 , *( u + 2 ) ) ) ;
  *( u + 1 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 1 ) ) ,
                           SSE2_MUL( sm1 , *( u + 3 ) ) ) ;
  *( u + 2 ) = SSE_FLIP( SSE2_CONJ( *( u + 1 ) ) ) ;
  *( u + 3 ) = SSE2_CONJ( *( u + 0 ) ) ;
#else
  // just a call to su2 multiply
  shortened_su2_multiply( U , s0 , s1 , su2_index ) ;
#endif
  return ;
}
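// For su2_index % 3 == 0 the rotation above is left multiplication by the
// embedded block ( read off directly from the case 0 code ) :
//
//  |  s0           s1          0 |
//  | -conj( s1 )   conj( s0 )  0 |
//  |  0            0           1 |
//
// cases 1 and 2 embed the 2x2 block in the (1,2) and (0,2) row/column pairs
// instead , with case 2 using a differently parametrised su(2) block .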
// Gauss-Jordan inversion with partial pivoting , M_1 = M^{-1}
static int
gauss_jordan( GLU_complex M_1[ NCNC ] ,
              const GLU_complex M[ NCNC ] )
{
  __m128d a[ NCNC ] GLUalign ; // temporary space to overwrite matrix
  register __m128d best , attempt , m1 , fac ;
  size_t i , j , piv ;
  // copy M into double-precision SSE pairs and set M_1 to the identity
  for( i = 0 ; i < NCNC ; i++ ) {
    a[ i ] = _mm_setr_pd( creal( M[i] ) , cimag( M[i] ) ) ;
    M_1[ i ] = ( i%(NC+1) ) ? 0.0 : 1.0 ;
  }
  // set these pointers, pB will be the inverse
  __m128d *pB = (__m128d*)M_1 , *pA = (__m128d*)a ;
  // loop over diagonal of the square matrix M
  for( i = 0 ; i < NC-1 ; i++ ) {
    // column pivot by selecting the largest in magnitude value
    piv = i ;
    best = absfac( *( pA + i*(NC+1) ) ) ;
    for( j = i+1 ; j < NC ; j++ ) {
      attempt = absfac( *( pA + i + j*NC ) ) ;
      if( _mm_ucomilt_sd( best , attempt ) ) {
        piv = j ;
        best = attempt ;
      }
    }
    // if we must pivot then we do
    if( piv != i ) {
      swap_rows( pA , pB , piv , i ) ;
    }
    // perform gaussian elimination to obtain the upper triangular form
    fac = _mm_div_pd( SSE2_CONJ( *( pA + i*(NC+1) ) ) , best ) ;
    for( j = NC-1 ; j > i ; j-- ) { // eliminate the rows below the pivot
      eliminate_column( pA , pB , fac , i , j ) ;
    }
  }
  // a is upper triangular, do the same for the upper half
  // no pivoting to be done here
  for( i = NC-1 ; i > 0 ; i-- ) {
    fac = SSE2_inverse( *( pA + i*(NC+1) ) ) ;
    for( j = 0 ; j < i ; j++ ) {
      eliminate_column( pA , pB , fac , i , j ) ;
    }
  }
  // multiply each row of the inverse by the reciprocal of its diagonal
  for( j = 0 ; j < NC ; j++ ) {
    m1 = SSE2_inverse( *pA ) ;
    for( i = 0 ; i < NC ; i++ ) {
      *pB = SSE2_MUL( *pB , m1 ) ;
      pB++ ;
    }
    pA += NC+1 ;
  }
  return GLU_SUCCESS ;
}
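// usage sketch ( illustration only , assuming an invertible input ) :
//
//   GLU_complex U[ NCNC ] GLUalign , U_1[ NCNC ] GLUalign ;
//   // ... fill U with the matrix to be inverted ...
//   if( gauss_jordan( U_1 , U ) == GLU_SUCCESS ) {
//     // U_1 now approximates U^{-1} , i.e. U.U_1 ~ identity
//   }
//
// note that the routine always returns GLU_SUCCESS as written : a singular
// ( or numerically tiny ) pivot shows up as a division by ~0 rather than as
// an error code .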
// computes a = b.c for special unitary b and c , with the last row of a
// reconstructed as a completion rather than multiplied out
void
multab_suNC( GLU_complex a[ NCNC ] ,
             const GLU_complex b[ NCNC ] ,
             const GLU_complex c[ NCNC ] )
{
  // recast to alignment
  __m128d *A = (__m128d*)a ;
  const __m128d *B = (const __m128d*)b ;
  const __m128d *C = (const __m128d*)c ;
#if NC == 3
  // top row
  // a[0] = b[0] * c[0] + b[1] * c[3] + b[2] * c[6] ;
  *( A + 0 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 0 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 3 ) ) ,
                                       SSE2_MUL( *( B + 2 ) , *( C + 6 ) ) ) ) ;
  // a[1] = b[0] * c[1] + b[1] * c[4] + b[2] * c[7] ;
  *( A + 1 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 1 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 4 ) ) ,
                                       SSE2_MUL( *( B + 2 ) , *( C + 7 ) ) ) ) ;
  // a[2] = b[0] * c[2] + b[1] * c[5] + b[2] * c[8] ;
  *( A + 2 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 2 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 5 ) ) ,
                                       SSE2_MUL( *( B + 2 ) , *( C + 8 ) ) ) ) ;
  // middle row
  // a[3] = b[3] * c[0] + b[4] * c[3] + b[5] * c[6] ;
  *( A + 3 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 0 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 3 ) ) ,
                                       SSE2_MUL( *( B + 5 ) , *( C + 6 ) ) ) ) ;
  // a[4] = b[3] * c[1] + b[4] * c[4] + b[5] * c[7] ;
  *( A + 4 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 1 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 4 ) ) ,
                                       SSE2_MUL( *( B + 5 ) , *( C + 7 ) ) ) ) ;
  // a[5] = b[3] * c[2] + b[4] * c[5] + b[5] * c[8] ;
  *( A + 5 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 2 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 5 ) ) ,
                                       SSE2_MUL( *( B + 5 ) , *( C + 8 ) ) ) ) ;
  // bottom row as a completion of the top two
  // a[6] = conj( a[1] * a[5] - a[2] * a[4] ) ;
  *( A + 6 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 1 ) , *( A + 5 ) ) ,
                                      SSE2_MUL( *( A + 2 ) , *( A + 4 ) ) ) ) ;
  // a[7] = conj( a[2] * a[3] - a[0] * a[5] ) ;
  *( A + 7 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 2 ) , *( A + 3 ) ) ,
                                      SSE2_MUL( *( A + 0 ) , *( A + 5 ) ) ) ) ;
  // a[8] = conj( a[0] * a[4] - a[1] * a[3] ) ;
  *( A + 8 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 0 ) , *( A + 4 ) ) ,
                                      SSE2_MUL( *( A + 1 ) , *( A + 3 ) ) ) ) ;
#elif NC == 2
  *( A + 0 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 0 ) ) ,
                           SSE2_MUL( *( B + 1 ) , *( C + 2 ) ) ) ;
  *( A + 1 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 1 ) ) ,
                           SSE2_MUL( *( B + 1 ) , *( C + 3 ) ) ) ;
  // a[2] = -conj( a[1] )
  *( A + 2 ) = SSE_FLIP( SSE2_CONJ( *( A + 1 ) ) ) ;
  // a[3] = conj( a[0] )
  *( A + 3 ) = SSE2_CONJ( *( A + 0 ) ) ;
#else
  // no NC-specific shortcut : fall back on the general multiply
  multab( a , b , c ) ;
#endif
  return ;
}
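// The completions above rely on b and c being special unitary , so that
// a = b.c is too and its last row is fixed by the first NC-1 rows . For
// SU(3) the third row is the conjugated cross product of the first two ,
//
//   a[6] = conj( a[1]*a[5] - a[2]*a[4] )
//   a[7] = conj( a[2]*a[3] - a[0]*a[5] )
//   a[8] = conj( a[0]*a[4] - a[1]*a[3] )
//
// and for SU(2) the second row is ( -conj( a[1] ) , conj( a[0] ) ) , which
// saves the explicit complex multiplies for the final row .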