void
multab_suNC( GLU_complex a[ NCNC ] ,
             const GLU_complex b[ NCNC ] ,
             const GLU_complex c[ NCNC ] )
{
  // recast to alignment
  __m128d *A = (__m128d*)a ;
  const __m128d *B = (const __m128d*)b ;
  const __m128d *C = (const __m128d*)c ;
#if NC == 3
  // top row
  // a[0] = b[0] * c[0] + b[1] * c[3] + b[2] * c[6] ;
  *( A + 0 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 0 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 3 ) ) ,
                                       SSE2_MUL( *( B + 2 ) , *( C + 6 ) ) ) ) ;
  // a[1] = b[0] * c[1] + b[1] * c[4] + b[2] * c[7] ;
  *( A + 1 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 1 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 4 ) ) ,
                                       SSE2_MUL( *( B + 2 ) , *( C + 7 ) ) ) ) ;
  // a[2] = b[0] * c[2] + b[1] * c[5] + b[2] * c[8] ;
  *( A + 2 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 2 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 1 ) , *( C + 5 ) ) ,
                                       SSE2_MUL( *( B + 2 ) , *( C + 8 ) ) ) ) ;
  // middle row
  // a[3] = b[3] * c[0] + b[4] * c[3] + b[5] * c[6] ;
  *( A + 3 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 0 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 3 ) ) ,
                                       SSE2_MUL( *( B + 5 ) , *( C + 6 ) ) ) ) ;
  // a[4] = b[3] * c[1] + b[4] * c[4] + b[5] * c[7] ;
  *( A + 4 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 1 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 4 ) ) ,
                                       SSE2_MUL( *( B + 5 ) , *( C + 7 ) ) ) ) ;
  // a[5] = b[3] * c[2] + b[4] * c[5] + b[5] * c[8] ;
  *( A + 5 ) = _mm_add_pd( SSE2_MUL( *( B + 3 ) , *( C + 2 ) ) ,
                           _mm_add_pd( SSE2_MUL( *( B + 4 ) , *( C + 5 ) ) ,
                                       SSE2_MUL( *( B + 5 ) , *( C + 8 ) ) ) ) ;
  // bottom row as a completion of the top two
  // a[6] = conj( a[1] * a[5] - a[2] * a[4] ) ;
  *( A + 6 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 1 ) , *( A + 5 ) ) ,
                                      SSE2_MUL( *( A + 2 ) , *( A + 4 ) ) ) ) ;
  // a[7] = conj( a[2] * a[3] - a[0] * a[5] ) ;
  *( A + 7 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 2 ) , *( A + 3 ) ) ,
                                      SSE2_MUL( *( A + 0 ) , *( A + 5 ) ) ) ) ;
  // a[8] = conj( a[0] * a[4] - a[1] * a[3] ) ;
  *( A + 8 ) = SSE2_CONJ( _mm_sub_pd( SSE2_MUL( *( A + 0 ) , *( A + 4 ) ) ,
                                      SSE2_MUL( *( A + 1 ) , *( A + 3 ) ) ) ) ;
#elif NC == 2
  *( A + 0 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 0 ) ) ,
                           SSE2_MUL( *( B + 1 ) , *( C + 2 ) ) ) ;
  *( A + 1 ) = _mm_add_pd( SSE2_MUL( *( B + 0 ) , *( C + 1 ) ) ,
                           SSE2_MUL( *( B + 1 ) , *( C + 3 ) ) ) ;
  // a[2] = -conj( a[1] )
  *( A + 2 ) = SSE_FLIP( SSE2_CONJ( *( A + 1 ) ) ) ;
  // a[3] = conj( a[0] )
  *( A + 3 ) = SSE2_CONJ( *( A + 0 ) ) ;
#else
  // fall back to the plain matrix multiply for other NC
  // ( assumes the generic multab() is visible in this scope )
  multab( a , b , c ) ;
#endif
  return ;
}
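/*
   Illustrative scalar sketch ( guarded out of the build ) of what the NC == 3
   branch of multab_suNC above computes : the top two rows of a = b * c are the
   ordinary complex matrix product , and the bottom row is reconstructed as the
   conjugate cross product of the first two , which is valid because the product
   of two SU(3) matrices is again SU(3) . This assumes GLU_complex is a C99
   double complex stored row-major ; multab_suNC_ref is a hypothetical name and
   not part of the library API .
*/
#if 0
static void
multab_suNC_ref( GLU_complex a[ NCNC ] ,
                 const GLU_complex b[ NCNC ] ,
                 const GLU_complex c[ NCNC ] )
{
  // top two rows : ordinary matrix multiply
  a[0] = b[0] * c[0] + b[1] * c[3] + b[2] * c[6] ;
  a[1] = b[0] * c[1] + b[1] * c[4] + b[2] * c[7] ;
  a[2] = b[0] * c[2] + b[1] * c[5] + b[2] * c[8] ;
  a[3] = b[3] * c[0] + b[4] * c[3] + b[5] * c[6] ;
  a[4] = b[3] * c[1] + b[4] * c[4] + b[5] * c[7] ;
  a[5] = b[3] * c[2] + b[4] * c[5] + b[5] * c[8] ;
  // bottom row : completion from unitarity and unit determinant
  a[6] = conj( a[1] * a[5] - a[2] * a[4] ) ;
  a[7] = conj( a[2] * a[3] - a[0] * a[5] ) ;
  a[8] = conj( a[0] * a[4] - a[1] * a[3] ) ;
}
#endif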
// rotate a matrix U = su2_i*U where su2_i is an su2 matrix embedded in suN
void
su2_rotate( GLU_complex U[ NCNC ] ,
            const GLU_complex s0 ,
            const GLU_complex s1 ,
            const size_t su2_index )
{
#if NC == 3
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  register __m128d tmp0 , tmp1 , a , b ;
  switch( su2_index%3 ) { // again I don't like this
  case 0 :
    // first one
    a = *( u + 0 ) ; b = *( u + 3 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ;
    *( u + 3 ) = tmp1 ;
    // second one
    a = *( u + 1 ) ; b = *( u + 4 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ;
    *( u + 4 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 5 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ;
    *( u + 5 ) = tmp1 ;
    break ;
  case 1 :
    // first one
    a = *( u + 3 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 3 ) = tmp0 ;
    *( u + 6 ) = tmp1 ;
    // second one
    a = *( u + 4 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 4 ) = tmp0 ;
    *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 5 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 5 ) = tmp0 ;
    *( u + 8 ) = tmp1 ;
    break ;
  case 2 :
    // first one
    a = *( u + 0 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ;
    *( u + 6 ) = tmp1 ;
    // second
    a = *( u + 1 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ;
    *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ;
    *( u + 8 ) = tmp1 ;
    break ;
  }
#elif NC == 2
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  *( u + 0 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 0 ) ) ,
                           SSE2_MUL( sm1 , *( u + 2 ) ) ) ;
  *( u + 1 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 1 ) ) ,
                           SSE2_MUL( sm1 , *( u + 3 ) ) ) ;
  *( u + 2 ) = SSE_FLIP( SSE2_CONJ( *( u + 1 ) ) ) ;
  *( u + 3 ) = SSE2_CONJ( *( u + 0 ) ) ;
#else
  // just a call to su2 multiply
  shortened_su2_multiply( U , s0 , s1 , su2_index ) ;
#endif
  return ;
}
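/*
   Illustrative scalar sketch ( guarded out of the build ) of the case 0 branch
   of su2_rotate above , i.e. the SU(2) subgroup acting on rows 0 and 1 of U .
   It assumes the embedded su2 element has the form
   ( ( s0 , s1 ) , ( -conj(s1) , conj(s0) ) ) , that SSE2_MULCONJ( x , y )
   computes conj( x ) * y , and that U is stored row-major ; the helper name
   su2_rotate_rows01_ref is hypothetical and not part of the library .
*/
#if 0
static void
su2_rotate_rows01_ref( GLU_complex U[ NCNC ] ,
                       const GLU_complex s0 ,
                       const GLU_complex s1 )
{
  size_t j ;
  for( j = 0 ; j < NC ; j++ ) {
    const GLU_complex a = U[ j ] ;      // row 0 , column j
    const GLU_complex b = U[ NC + j ] ; // row 1 , column j
    U[ j ]      = s0 * a + s1 * b ;
    U[ NC + j ] = conj( s0 ) * b - conj( s1 ) * a ;
  }
}
#endif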