/*
 * a52_imdct_512 - long-block inverse transform, performed in place.
 *
 * data:  256 input values, overwritten with 256 windowed output values.
 * delay: 128 values carried over between calls; each entry is consumed
 *        for the overlap with the current block and then replaced.
 * bias:  not referenced by name in this body.
 *        NOTE(review): BUTTERFLY_B is defined elsewhere and may pick
 *        `bias` up through macro expansion -- confirm before removing.
 *
 * Steps: pre-twiddle into a 128-point complex buffer (input gathered in
 * fftorder[] order), ifft128(), then post-twiddle + window + overlap.
 */
void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
{
    const sample_t * win = a52_imdct_window;
    complex_t z[128];
    int n;

    /* Pre-IFFT complex multiply: pair data[src] with its mirror data[255-src]. */
    for (n = 0; n < 128; n++) {
        int src = fftorder[n];
        sample_t tw_re = pre1[n].real;
        sample_t tw_im = pre1[n].imag;

        BUTTERFLY_0 (z[n].real, z[n].imag, tw_re, tw_im,
                     data[src], data[255-src]);
    }

    ifft128 (z);

    /* Post-IFFT complex multiply, then window and convert to a real signal. */
    for (n = 0; n < 64; n++) {
        int lo = 2 * n;                 /* even output index, 0..126 */
        sample_t tw_re = post1[n].real;
        sample_t tw_im = post1[n].imag;
        sample_t head_r, head_i;        /* from z[n]       */
        sample_t tail_r, tail_i;        /* from z[127 - n] */
        sample_t wa, wb;

        /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) */
        BUTTERFLY_0 (head_r, head_i, tw_im, tw_re, z[n].imag, z[n].real);
        BUTTERFLY_0 (tail_r, tail_i, tw_re, tw_im,
                     z[127-n].imag, z[127-n].real);

        /* Even sample and its mirror; the imaginary part feeds the next call. */
        wa = win[lo];
        wb = win[255-lo];
        BUTTERFLY_B (data[255-lo], data[lo], wb, wa, head_r, delay[lo]);
        delay[lo] = head_i;

        /* Odd sample and its mirror. */
        wa = win[lo+1];
        wb = win[254-lo];
        BUTTERFLY_B (data[lo+1], data[254-lo], wa, wb, tail_r, delay[lo+1]);
        delay[lo+1] = tail_i;
    }
}
/*
 * a52_imdct_256 - short-block inverse transform, performed in place.
 *
 * The 256 input values are treated as two interleaved sequences
 * (data[k] / data[254-k] and data[k+1] / data[255-k]); each goes through
 * its own 64-point complex IFFT.
 *
 * data:  256 input values, overwritten with 256 windowed output values.
 * delay: 128 values carried over between calls; each entry is consumed
 *        for the overlap with the current block and then replaced.
 * bias:  not referenced by name in this body.
 *        NOTE(review): BUTTERFLY_B is defined elsewhere and may pick
 *        `bias` up through macro expansion -- confirm before removing.
 */
void a52_imdct_256 (sample_t * data, sample_t * delay, sample_t bias)
{
    const sample_t * win = a52_imdct_window;
    complex_t za[64], zb[64];
    int n;

    /* Pre-IFFT complex multiply; both sequences share the pre2[] twiddles. */
    for (n = 0; n < 64; n++) {
        int src = fftorder[n];
        sample_t tw_re = pre2[n].real;
        sample_t tw_im = pre2[n].imag;

        BUTTERFLY_0 (za[n].real, za[n].imag, tw_re, tw_im,
                     data[src], data[254-src]);
        BUTTERFLY_0 (zb[n].real, zb[n].imag, tw_re, tw_im,
                     data[src+1], data[255-src]);
    }

    ifft64 (za);
    ifft64 (zb);

    /* Post-IFFT complex multiply, then window and convert to a real signal. */
    for (n = 0; n < 32; n++) {
        int lo = 2 * n;                 /* even index, 0..62 */
        sample_t tw_re = post2[n].real;
        sample_t tw_im = post2[n].imag;
        sample_t p_r, p_i;              /* from za[n]      */
        sample_t q_r, q_i;              /* from za[63 - n] */
        sample_t r_r, r_i;              /* from zb[n]      */
        sample_t s_r, s_i;              /* from zb[63 - n] */
        sample_t wa, wb;

        /* y1[n] = z1[n] * (xcos2[n] + j * xsin2[n]), likewise for z2 */
        BUTTERFLY_0 (p_r, p_i, tw_im, tw_re, za[n].imag, za[n].real);
        BUTTERFLY_0 (q_r, q_i, tw_re, tw_im, za[63-n].imag, za[63-n].real);
        BUTTERFLY_0 (r_r, r_i, tw_im, tw_re, zb[n].imag, zb[n].real);
        BUTTERFLY_0 (s_r, s_i, tw_re, tw_im, zb[63-n].imag, zb[63-n].real);

        /*
         * Outputs come from the first transform (za) combined with the
         * stored delay; results of the second transform (zb) become the
         * delay for the next call.
         */
        wa = win[lo];
        wb = win[255-lo];
        BUTTERFLY_B (data[255-lo], data[lo], wb, wa, p_r, delay[lo]);
        delay[lo] = r_i;

        wa = win[128+lo];
        wb = win[127-lo];
        BUTTERFLY_B (data[128+lo], data[127-lo], wa, wb, p_i, delay[127-lo]);
        delay[127-lo] = r_r;

        wa = win[lo+1];
        wb = win[254-lo];
        BUTTERFLY_B (data[254-lo], data[lo+1], wb, wa, q_i, delay[lo+1]);
        delay[lo+1] = s_r;

        wa = win[129+lo];
        wb = win[126-lo];
        BUTTERFLY_B (data[129+lo], data[126-lo], wa, wb, q_r, delay[126-lo]);
        delay[126-lo] = s_i;
    }
}
void fft64_2way( void *a ) { __m256i* const A = a; register __m256i X0, X1, X2, X3, X4, X5, X6, X7; #define X(i) X##i X0 = A[0]; X1 = A[1]; X2 = A[2]; X3 = A[3]; X4 = A[4]; X5 = A[5]; X6 = A[6]; X7 = A[7]; #define DO_REDUCE(i) X(i) = REDUCE( X(i) ) // Begin with 8 parallels DIF FFT_8 // // FFT_8 using w=4 as 8th root of unity // Unrolled decimation in frequency (DIF) radix-2 NTT. // Output data is in revbin_permuted order. static const int w[] = {0, 2, 4, 6}; // __m256i *Twiddle = (__m256i*)FFT64_Twiddle; #define BUTTERFLY_0( i,j ) \ do { \ __m256i v = X(j); \ X(j) = _mm256_add_epi16( X(i), X(j) ); \ X(i) = _mm256_sub_epi16( X(i), v ); \ } while(0) #define BUTTERFLY_N( i,j,n ) \ do { \ __m256i v = X(j); \ X(j) = _mm256_add_epi16( X(i), X(j) ); \ X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \ } while(0) BUTTERFLY_0( 0, 4 ); BUTTERFLY_N( 1, 5, 1 ); BUTTERFLY_N( 2, 6, 2 ); BUTTERFLY_N( 3, 7, 3 ); DO_REDUCE( 2 ); DO_REDUCE( 3 ); BUTTERFLY_0( 0, 2 ); BUTTERFLY_0( 4, 6 ); BUTTERFLY_N( 1, 3, 2 ); BUTTERFLY_N( 5, 7, 2 ); DO_REDUCE( 1 ); BUTTERFLY_0( 0, 1 ); BUTTERFLY_0( 2, 3 ); BUTTERFLY_0( 4, 5 ); BUTTERFLY_0( 6, 7 ); /* We don't need to reduce X(7) */ DO_REDUCE_FULL_S( 0 ); DO_REDUCE_FULL_S( 1 ); DO_REDUCE_FULL_S( 2 ); DO_REDUCE_FULL_S( 3 ); DO_REDUCE_FULL_S( 4 ); DO_REDUCE_FULL_S( 5 ); DO_REDUCE_FULL_S( 6 ); #undef BUTTERFLY_0 #undef BUTTERFLY_N // Multiply by twiddle factors X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i ); X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i ); X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i ); X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i ); X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i ); X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i ); X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i ); // Transpose the FFT state with a revbin order permutation // on the rows and the column. // This will make the full FFT_64 in order. 
#define INTERLEAVE(i,j) \ do { \ __m256i t1= X(i); \ __m256i t2= X(j); \ X(i) = _mm256_unpacklo_epi16( t1, t2 ); \ X(j) = _mm256_unpackhi_epi16( t1, t2 ); \ } while(0) INTERLEAVE( 1, 0 ); INTERLEAVE( 3, 2 ); INTERLEAVE( 5, 4 ); INTERLEAVE( 7, 6 ); INTERLEAVE( 2, 0 ); INTERLEAVE( 3, 1 ); INTERLEAVE( 6, 4 ); INTERLEAVE( 7, 5 ); INTERLEAVE( 4, 0 ); INTERLEAVE( 5, 1 ); INTERLEAVE( 6, 2 ); INTERLEAVE( 7, 3 ); #undef INTERLEAVE //Finish with 8 parallels DIT FFT_8 //FFT_8 using w=4 as 8th root of unity // Unrolled decimation in time (DIT) radix-2 NTT. // Input data is in revbin_permuted order. #define BUTTERFLY_0( i,j ) \ do { \ __m256i u = X(j); \ X(j) = _mm256_sub_epi16( X(j), X(i) ); \ X(i) = _mm256_add_epi16( u, X(i) ); \ } while(0) #define BUTTERFLY_N( i,j,n ) \ do { \ __m256i u = X(j); \ X(i) = _mm256_slli_epi16( X(i), w[n] ); \ X(j) = _mm256_sub_epi16( X(j), X(i) ); \ X(i) = _mm256_add_epi16( u, X(i) ); \ } while(0) DO_REDUCE( 0 ); DO_REDUCE( 1 ); DO_REDUCE( 2 ); DO_REDUCE( 3 ); DO_REDUCE( 4 ); DO_REDUCE( 5 ); DO_REDUCE( 6 ); DO_REDUCE( 7 ); BUTTERFLY_0( 0, 1 ); BUTTERFLY_0( 2, 3 ); BUTTERFLY_0( 4, 5 ); BUTTERFLY_0( 6, 7 ); BUTTERFLY_0( 0, 2 ); BUTTERFLY_0( 4, 6 ); BUTTERFLY_N( 1, 3, 2 ); BUTTERFLY_N( 5, 7, 2 ); DO_REDUCE( 3 ); BUTTERFLY_0( 0, 4 ); BUTTERFLY_N( 1, 5, 1 ); BUTTERFLY_N( 2, 6, 2 ); BUTTERFLY_N( 3, 7, 3 ); DO_REDUCE_FULL_S( 0 ); DO_REDUCE_FULL_S( 1 ); DO_REDUCE_FULL_S( 2 ); DO_REDUCE_FULL_S( 3 ); DO_REDUCE_FULL_S( 4 ); DO_REDUCE_FULL_S( 5 ); DO_REDUCE_FULL_S( 6 ); DO_REDUCE_FULL_S( 7 ); #undef BUTTERFLY A[0] = X0; A[1] = X1; A[2] = X2; A[3] = X3; A[4] = X4; A[5] = X5; A[6] = X6; A[7] = X7; #undef X }