void fft64_2way( void *a ) { __m256i* const A = a; register __m256i X0, X1, X2, X3, X4, X5, X6, X7; #define X(i) X##i X0 = A[0]; X1 = A[1]; X2 = A[2]; X3 = A[3]; X4 = A[4]; X5 = A[5]; X6 = A[6]; X7 = A[7]; #define DO_REDUCE(i) X(i) = REDUCE( X(i) ) // Begin with 8 parallels DIF FFT_8 // // FFT_8 using w=4 as 8th root of unity // Unrolled decimation in frequency (DIF) radix-2 NTT. // Output data is in revbin_permuted order. static const int w[] = {0, 2, 4, 6}; // __m256i *Twiddle = (__m256i*)FFT64_Twiddle; #define BUTTERFLY_0( i,j ) \ do { \ __m256i v = X(j); \ X(j) = _mm256_add_epi16( X(i), X(j) ); \ X(i) = _mm256_sub_epi16( X(i), v ); \ } while(0) #define BUTTERFLY_N( i,j,n ) \ do { \ __m256i v = X(j); \ X(j) = _mm256_add_epi16( X(i), X(j) ); \ X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \ } while(0) BUTTERFLY_0( 0, 4 ); BUTTERFLY_N( 1, 5, 1 ); BUTTERFLY_N( 2, 6, 2 ); BUTTERFLY_N( 3, 7, 3 ); DO_REDUCE( 2 ); DO_REDUCE( 3 ); BUTTERFLY_0( 0, 2 ); BUTTERFLY_0( 4, 6 ); BUTTERFLY_N( 1, 3, 2 ); BUTTERFLY_N( 5, 7, 2 ); DO_REDUCE( 1 ); BUTTERFLY_0( 0, 1 ); BUTTERFLY_0( 2, 3 ); BUTTERFLY_0( 4, 5 ); BUTTERFLY_0( 6, 7 ); /* We don't need to reduce X(7) */ DO_REDUCE_FULL_S( 0 ); DO_REDUCE_FULL_S( 1 ); DO_REDUCE_FULL_S( 2 ); DO_REDUCE_FULL_S( 3 ); DO_REDUCE_FULL_S( 4 ); DO_REDUCE_FULL_S( 5 ); DO_REDUCE_FULL_S( 6 ); #undef BUTTERFLY_0 #undef BUTTERFLY_N // Multiply by twiddle factors X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i ); X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i ); X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i ); X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i ); X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i ); X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i ); X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i ); // Transpose the FFT state with a revbin order permutation // on the rows and the column. // This will make the full FFT_64 in order. 
#define INTERLEAVE(i,j) \ do { \ __m256i t1= X(i); \ __m256i t2= X(j); \ X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ X(j) = _mm256_unpackhi_epi16( t1, t2 ); \ } while(0) INTERLEAVE( 1, 0 ); INTERLEAVE( 3, 2 ); INTERLEAVE( 5, 4 ); INTERLEAVE( 7, 6 ); INTERLEAVE( 2, 0 ); INTERLEAVE( 3, 1 ); INTERLEAVE( 6, 4 ); INTERLEAVE( 7, 5 ); INTERLEAVE( 4, 0 ); INTERLEAVE( 5, 1 ); INTERLEAVE( 6, 2 ); INTERLEAVE( 7, 3 ); #undef INTERLEAVE //Finish with 8 parallels DIT FFT_8 //FFT_8 using w=4 as 8th root of unity // Unrolled decimation in time (DIT) radix-2 NTT. // Input data is in revbin_permuted order. #define BUTTERFLY_0( i,j ) \ do { \ __m256i u = X(j); \ X(j) = _mm256_sub_epi16( X(j), X(i) ); \ X(i) = _mm256_add_epi16( u, X(i) ); \ } while(0) #define BUTTERFLY_N( i,j,n ) \ do { \ __m256i u = X(j); \ X(i) = _mm256_slli_epi16( X(i), w[n] ); \ X(j) = _mm256_sub_epi16( X(j), X(i) ); \ X(i) = _mm256_add_epi16( u, X(i) ); \ } while(0) DO_REDUCE( 0 ); DO_REDUCE( 1 ); DO_REDUCE( 2 ); DO_REDUCE( 3 ); DO_REDUCE( 4 ); DO_REDUCE( 5 ); DO_REDUCE( 6 ); DO_REDUCE( 7 ); BUTTERFLY_0( 0, 1 ); BUTTERFLY_0( 2, 3 ); BUTTERFLY_0( 4, 5 ); BUTTERFLY_0( 6, 7 ); BUTTERFLY_0( 0, 2 ); BUTTERFLY_0( 4, 6 ); BUTTERFLY_N( 1, 3, 2 ); BUTTERFLY_N( 5, 7, 2 ); DO_REDUCE( 3 ); BUTTERFLY_0( 0, 4 ); BUTTERFLY_N( 1, 5, 1 ); BUTTERFLY_N( 2, 6, 2 ); BUTTERFLY_N( 3, 7, 3 ); DO_REDUCE_FULL_S( 0 ); DO_REDUCE_FULL_S( 1 ); DO_REDUCE_FULL_S( 2 ); DO_REDUCE_FULL_S( 3 ); DO_REDUCE_FULL_S( 4 ); DO_REDUCE_FULL_S( 5 ); DO_REDUCE_FULL_S( 6 ); DO_REDUCE_FULL_S( 7 ); #undef BUTTERFLY A[0] = X0; A[1] = X1; A[2] = X2; A[3] = X3; A[4] = X4; A[5] = X5; A[6] = X6; A[7] = X7; #undef X }
/*
 * In-place FFT_64 (number-theoretic transform) over the eight v16 vectors
 * at `a`.
 *
 * a: points to at least eight v16 values; A[0]..A[7] are read on entry and
 *    overwritten on exit.
 *
 * Structure: eight parallel DIF FFT_8, a twiddle multiplication, a
 * transpose via interleaves, then eight parallel DIT FFT_8.
 *
 * v16, the v16_* operations, REDUCE, DO_REDUCE_FULL_S, XCAT and
 * FFT64_Twiddle are defined elsewhere in the file.
 * NOTE(review): REDUCE / DO_REDUCE_FULL_S look like partial / full modular
 * reductions of the vector elements - confirm against their definitions.
 */
void fft64(void *a) {

  v16* const A = a;

  register v16 X0, X1, X2, X3, X4, X5, X6, X7;

  /*
   * X(i) pastes the index onto the register name so the macros below can
   * address X0..X7 by number.
   */
#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

  /* Apply REDUCE to register i in place. */
#define DO_REDUCE(i)                            \
  X(i) = REDUCE(X(i))

  /*
   * Begin with 8 parallels DIF FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in frequency (DIF) radix-2 NTT.
   *  Output data is in revbin_permuted order.
   */

  /*
   * Shift counts selected by the butterfly's third argument: wn<n> = 2n.
   * Assuming v16_shift_l is a per-element left shift, shifting by 2n bits
   * multiplies by 4^n, the n-th power of the root of unity w=4.
   * wn0 is never used for an actual shift, but it must exist because
   * XCAT(wn,n) is still expanded in the dead `if (n)` branch when n is 0.
   */
#define wn0 0
#define wn1 2
#define wn2 4
#define wn3 6

  /*
   * DIF butterfly: X(i) receives the sum, X(j) receives the difference,
   * shifted left by wn<n> when n is non-zero. n is a literal at every call
   * site, so the `if (n)` test is a compile-time constant.
   */
#define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    v16 u= X(i);                                        \
    v16 v= X(j);                                        \
    X(i) =  v16_add(u, v);                              \
    if (n)                                              \
      X(j) = v16_shift_l(v16_sub(u, v), XCAT(wn,n));    \
    else                                                \
      X(j) = v16_sub(u, v);                             \
  } while(0)

  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);

  /* Only the registers that were just shifted (5, 6, 7) are reduced here. */
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);

  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);

  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);

  /* We don't need to reduce X(0) */
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);

#undef BUTTERFLY

  /*
   * Multiply by twiddle factors
   * X(0) is the only register that is not multiplied.
   */
  X(1) = v16_mul(X(1), FFT64_Twiddle[0].v16);
  X(2) = v16_mul(X(2), FFT64_Twiddle[1].v16);
  X(3) = v16_mul(X(3), FFT64_Twiddle[2].v16);
  X(4) = v16_mul(X(4), FFT64_Twiddle[3].v16);
  X(5) = v16_mul(X(5), FFT64_Twiddle[4].v16);
  X(6) = v16_mul(X(6), FFT64_Twiddle[5].v16);
  X(7) = v16_mul(X(7), FFT64_Twiddle[6].v16);

  /*
   * Transpose the FFT state with a revbin order permutation
   * on the rows and the column.
   * This will make the full FFT_64 in order.
   */

  /*
   * Use the in-place interleave when one is provided; otherwise replace
   * X(i) / X(j) with the low / high interleave of the pair. The fallback
   * copies both inputs first because X(i) is overwritten before the high
   * interleave is computed.
   */
#ifdef v16_interleave_inplace
#define INTERLEAVE(i,j) v16_interleave_inplace(X(i), X(j))
#else
#define INTERLEAVE(i,j)                          \
  do {                                           \
    v16 t1= X(i);                                \
    v16 t2= X(j);                                \
    X(i) = v16_interleavel(t1, t2);              \
    X(j) = v16_interleaveh(t1, t2);              \
  } while(0)
#endif

  INTERLEAVE(0, 1);
  INTERLEAVE(2, 3);
  INTERLEAVE(4, 5);
  INTERLEAVE(6, 7);

  INTERLEAVE(0, 2);
  INTERLEAVE(1, 3);
  INTERLEAVE(4, 6);
  INTERLEAVE(5, 7);

  INTERLEAVE(0, 4);
  INTERLEAVE(1, 5);
  INTERLEAVE(2, 6);
  INTERLEAVE(3, 7);

#undef INTERLEAVE

  /*
   * Finish with 8 parallels DIT FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in time (DIT) radix-2 NTT.
   *  Input data is in revbin_permuted order.
   */

  /*
   * DIT butterfly: X(j) is shifted left by wn<n> first (when n is
   * non-zero), then X(i) receives the sum and X(j) the difference.
   */
#define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    v16 u= X(i);                                        \
    v16 v= X(j);                                        \
    if (n)                                              \
      v = v16_shift_l(v, XCAT(wn,n));                   \
    X(i) = v16_add(u, v);                               \
    X(j) = v16_sub(u, v);                               \
  } while(0)

  DO_REDUCE(0);
  DO_REDUCE(1);
  DO_REDUCE(2);
  DO_REDUCE(3);
  DO_REDUCE(4);
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);

  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);

  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);

  /* X(7) is reduced before the last stage shifts it by wn3 = 6 bits. */
  DO_REDUCE(7);

  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);

  DO_REDUCE_FULL_S(0);
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);

#undef BUTTERFLY

  /* Write the transformed state back over the input. */
  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X

}