예제 #1
0
void fft64_2way( void *a )
{
  __m256i* const A = a;
  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

#define DO_REDUCE(i)   X(i) = REDUCE( X(i) )

   // Begin with 8 parallels DIF FFT_8
   //
   // FFT_8 using w=4 as 8th root of unity
   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
   //  Output data is in revbin_permuted order.

  static const int w[] = {0, 2, 4, 6};
//   __m256i *Twiddle = (__m256i*)FFT64_Twiddle;


#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 1 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  // Multiply by twiddle factors
  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

  // Transpose the FFT state with a revbin order permutation
  // on the rows and the column.
  // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
  do { \
    __m256i t1= X(i); \
    __m256i t2= X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
  } while(0)

  INTERLEAVE( 1, 0 );
  INTERLEAVE( 3, 2 );
  INTERLEAVE( 5, 4 );
  INTERLEAVE( 7, 6 );

  INTERLEAVE( 2, 0 );
  INTERLEAVE( 3, 1 );
  INTERLEAVE( 6, 4 );
  INTERLEAVE( 7, 5 );

  INTERLEAVE( 4, 0 );
  INTERLEAVE( 5, 1 );
  INTERLEAVE( 6, 2 );
  INTERLEAVE( 7, 3 );

#undef INTERLEAVE

   //Finish with 8 parallels DIT FFT_8
   //FFT_8 using w=4 as 8th root of unity
   // Unrolled decimation in time (DIT) radix-2 NTT.
   // Input data is in revbin_permuted order.

#define BUTTERFLY_0( i,j ) \
do { \
   __m256i u = X(j); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)


#define BUTTERFLY_N( i,j,n ) \
do { \
   __m256i u = X(j); \
   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

  DO_REDUCE( 0 );
  DO_REDUCE( 1 );
  DO_REDUCE( 2 );
  DO_REDUCE( 3 );
  DO_REDUCE( 4 );
  DO_REDUCE( 5 );
  DO_REDUCE( 6 );
  DO_REDUCE( 7 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

#undef BUTTERFLY

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X
}
예제 #2
0
void fft64(void *a) {

  v16* const A = a;

  register v16 X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

#define DO_REDUCE(i)                            \
  X(i) = REDUCE(X(i))

  /*
   * Begin with 8 parallels DIF FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in frequency (DIF) radix-2 NTT.
   *  Output data is in revbin_permuted order.
   */

  #define wn0 0
  #define wn1 2
  #define wn2 4
  #define wn3 6

#define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    v16 u= X(i);                                        \
    v16 v= X(j);                                        \
    X(i) =  v16_add(u, v);                              \
    if (n)                                              \
      X(j) = v16_shift_l(v16_sub(u, v), XCAT(wn,n));	\
    else                                                \
      X(j) = v16_sub(u, v);                             \
  } while(0)

  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);
  
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);
  
  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);
  
  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);
  
  /* We don't need to reduce X(0) */
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);
    
#undef BUTTERFLY

  /*
   * Multiply by twiddle factors
   */

  X(1) = v16_mul(X(1), FFT64_Twiddle[0].v16);
  X(2) = v16_mul(X(2), FFT64_Twiddle[1].v16);
  X(3) = v16_mul(X(3), FFT64_Twiddle[2].v16);
  X(4) = v16_mul(X(4), FFT64_Twiddle[3].v16);
  X(5) = v16_mul(X(5), FFT64_Twiddle[4].v16);
  X(6) = v16_mul(X(6), FFT64_Twiddle[5].v16);
  X(7) = v16_mul(X(7), FFT64_Twiddle[6].v16);

  /*
   * Transpose the FFT state with a revbin order permutation
   * on the rows and the column.
   * This will make the full FFT_64 in order.
   */

#ifdef v16_interleave_inplace
#define INTERLEAVE(i,j) v16_interleave_inplace(X(i), X(j))
#else
#define INTERLEAVE(i,j)                          \
  do {                                           \
    v16 t1= X(i);                                \
    v16 t2= X(j);                                \
    X(i) = v16_interleavel(t1, t2);              \
    X(j) = v16_interleaveh(t1, t2);              \
  } while(0)
#endif

  INTERLEAVE(0, 1);
  INTERLEAVE(2, 3);
  INTERLEAVE(4, 5);
  INTERLEAVE(6, 7);

  INTERLEAVE(0, 2);
  INTERLEAVE(1, 3);
  INTERLEAVE(4, 6);
  INTERLEAVE(5, 7);

  INTERLEAVE(0, 4);
  INTERLEAVE(1, 5);
  INTERLEAVE(2, 6);
  INTERLEAVE(3, 7);

#undef INTERLEAVE

  /*
   * Finish with 8 parallels DIT FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in time (DIT) radix-2 NTT.
   *  Intput data is in revbin_permuted order.
   */
  
#define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    v16 u= X(i);                                        \
    v16 v= X(j);                                        \
    if (n)                                              \
      v = v16_shift_l(v, XCAT(wn,n));			\
    X(i) = v16_add(u, v);                               \
    X(j) = v16_sub(u, v);                               \
  } while(0)

  DO_REDUCE(0);
  DO_REDUCE(1);
  DO_REDUCE(2);
  DO_REDUCE(3);
  DO_REDUCE(4);
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);
  
  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);
  
  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);
  
  DO_REDUCE(7);
  
  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);
  
  DO_REDUCE_FULL_S(0);
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);
  
#undef BUTTERFLY

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X

}