コード例 #1
0
ファイル: tae.c プロジェクト: 0x64616E69656C/supercop
/*
 * Load sixteen 16-byte blocks from in8, optionally XOR some of them into
 * a running checksum, and store the interleaved ("transposed", in the
 * words of the original comments) blocks to out128.
 *
 *   in8    - source bytes; 16 blocks of 16 bytes are always read,
 *            whatever the value of len.
 *   out128 - receives 16 vectors.
 *   cksum  - if non-NULL, blocks len-1 down to 0 are XORed into *cksum.
 *   len    - number of leading blocks folded into the checksum; a value
 *            outside 0..16 leaves *cksum untouched.
 */
void read128_with_checksum(const uint8_t *in8, v16qi *out128, v16qi *cksum, int len) {
  enum { NROWS = 16 };
  v16qi row[NROWS];
  int i, stride;

  // Fetch every block up front.
  for (i = 0; i < NROWS; i++)
    row[i] = sse_load(in8 + 16*i);

  // Fold the first len blocks into the checksum, highest index first.
  if (cksum && len >= 0 && len <= NROWS) {
    for (i = len - 1; i >= 0; i--)
      *cksum ^= row[i];
  }

  // Four interleave passes with strides 8, 4, 2 and 1. In each pass,
  // every row whose index has the stride bit clear is paired with the
  // row `stride` positions above it. The trip counts are constants, so
  // an optimizing compiler is free to unroll both loops.
  for (stride = NROWS / 2; stride >= 1; stride /= 2) {
    for (i = 0; i < NROWS; i++) {
      if (i & stride)
        continue;
      const v16qi lo = row[i];
      const v16qi hi = row[i + stride];
      row[i]          = sse_interleavel(lo, hi);
      row[i + stride] = sse_interleaveh(lo, hi);
    }
  }

  // Even-numbered rows fill out128[0..7], odd-numbered rows out128[8..15].
  for (i = 0; i < NROWS / 2; i++)
    out128[i] = row[2*i];
  for (i = 0; i < NROWS / 2; i++)
    out128[NROWS / 2 + i] = row[2*i + 1];
}
コード例 #2
0
void fft64_2way( void *a )
{
  __m256i* const A = a;
  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

#define DO_REDUCE(i)   X(i) = REDUCE( X(i) )

   // Begin with 8 parallels DIF FFT_8
   //
   // FFT_8 using w=4 as 8th root of unity
   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
   //  Output data is in revbin_permuted order.

  static const int w[] = {0, 2, 4, 6};
//   __m256i *Twiddle = (__m256i*)FFT64_Twiddle;


#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 1 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  // Multiply by twiddle factors
  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

  // Transpose the FFT state with a revbin order permutation
  // on the rows and the column.
  // This will make the full FFT_64 in order.
#define INTERLEAVE(i,j) \
  do { \
    __m256i t1= X(i); \
    __m256i t2= X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
  } while(0)

  INTERLEAVE( 1, 0 );
  INTERLEAVE( 3, 2 );
  INTERLEAVE( 5, 4 );
  INTERLEAVE( 7, 6 );

  INTERLEAVE( 2, 0 );
  INTERLEAVE( 3, 1 );
  INTERLEAVE( 6, 4 );
  INTERLEAVE( 7, 5 );

  INTERLEAVE( 4, 0 );
  INTERLEAVE( 5, 1 );
  INTERLEAVE( 6, 2 );
  INTERLEAVE( 7, 3 );

#undef INTERLEAVE

   //Finish with 8 parallels DIT FFT_8
   //FFT_8 using w=4 as 8th root of unity
   // Unrolled decimation in time (DIT) radix-2 NTT.
   // Input data is in revbin_permuted order.

#define BUTTERFLY_0( i,j ) \
do { \
   __m256i u = X(j); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)


#define BUTTERFLY_N( i,j,n ) \
do { \
   __m256i u = X(j); \
   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

  DO_REDUCE( 0 );
  DO_REDUCE( 1 );
  DO_REDUCE( 2 );
  DO_REDUCE( 3 );
  DO_REDUCE( 4 );
  DO_REDUCE( 5 );
  DO_REDUCE( 6 );
  DO_REDUCE( 7 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

#undef BUTTERFLY

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X
}
コード例 #3
0
ファイル: tae.c プロジェクト: 0x64616E69656C/supercop
/*
 * Interleave the sixteen vectors of in128 in place, then optionally
 * store the leading rows to out8 and XOR the leading rows into *cksum.
 *
 *   in128 - 16 vectors, modified in place. Logical row r lives in
 *           in128[8*(r%2) + r/2].
 *   out8  - if non-NULL, rows mlen-1 down to 0 are stored at out8 + 16*r.
 *   cksum - if non-NULL, rows cklen-1 down to 0 are XORed into *cksum.
 *   mlen, cklen - row counts; a value outside 0..16 makes the
 *           corresponding step a no-op.
 */
void write128_checksum2(v16qi *in128, uint8_t* out8, v16qi *cksum,
		       int mlen, int cklen) {
  // slot[r] == 8*(r%2) + r/2: physical position of logical row r.
  static const unsigned char slot[16] = {
    0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
  };
  int r, stride;

  // Four interleave passes with strides 8, 4, 2 and 1. In each pass,
  // every row whose index has the stride bit clear is paired with the
  // row `stride` positions above it.
  for (stride = 8; stride >= 1; stride /= 2) {
    for (r = 0; r < 16; r++) {
      if (r & stride)
        continue;
      v16qi *lo = &in128[slot[r]];
      v16qi *hi = &in128[slot[r + stride]];
      const v16qi a = *lo;
      const v16qi b = *hi;
      *lo = sse_interleavel(a, b);
      *hi = sse_interleaveh(a, b);
    }
  }

  // Store the first mlen rows, highest row first.
  if (out8 && mlen >= 0 && mlen <= 16) {
    for (r = mlen - 1; r >= 0; r--)
      sse_store(out8 + 16*r, in128[slot[r]]);
  }

  // Fold the first cklen rows into the checksum, highest row first.
  if (cksum && cklen >= 0 && cklen <= 16) {
    for (r = cklen - 1; r >= 0; r--)
      *cksum ^= in128[slot[r]];
  }
}
コード例 #4
0
ファイル: vector.c プロジェクト: cbouilla/spriiiiiiiing
/*
 * fft64 - in-place transform of eight v16 vectors.
 *
 * a: points to eight consecutive v16 values. They are loaded into X0..X7,
 *    transformed, and written back to the same eight slots.
 *
 * Steps, in order:
 *   1. radix-2 decimation-in-frequency (DIF) butterflies on X0..X7,
 *   2. multiplication of X1..X7 by FFT64_Twiddle[0..6],
 *   3. three rounds of pairwise interleaving,
 *   4. radix-2 decimation-in-time (DIT) butterflies on X0..X7.
 *
 * REDUCE, DO_REDUCE_FULL_S, XCAT, FFT64_Twiddle and the v16_* helpers are
 * not defined in this function; they must come from the surrounding file.
 *
 * NOTE(review): DO_REDUCE and wn0..wn3 are never #undef'd below, so they
 * remain defined after this function -- confirm whether that is intended.
 */
void fft64(void *a) {

  v16* const A = a;

  register v16 X0, X1, X2, X3, X4, X5, X6, X7;

/* X(i) names the local X<i> by token pasting, so i must be a literal. */
#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

/* Replace X(i) by REDUCE(X(i)). */
#define DO_REDUCE(i)                            \
  X(i) = REDUCE(X(i))

  /*
   * Begin with 8 parallels DIF FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in frequency (DIF) radix-2 NTT.
   *  Output data is in revbin_permuted order.
   */

  /* Left-shift amounts selected by the third BUTTERFLY argument
   * (presumably XCAT(wn,n) pastes to wn<n> -- confirm XCAT's definition). */
  #define wn0 0
  #define wn1 2
  #define wn2 4
  #define wn3 6

/* DIF butterfly: X(i) = u + v; X(j) = u - v, additionally shifted left by
 * wn<n> when n is non-zero. */
#define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    v16 u= X(i);                                        \
    v16 v= X(j);                                        \
    X(i) =  v16_add(u, v);                              \
    if (n)                                              \
      X(j) = v16_shift_l(v16_sub(u, v), XCAT(wn,n));	\
    else                                                \
      X(j) = v16_sub(u, v);                             \
  } while(0)

  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);
  
  /* X5, X6 and X7 are the outputs that were shifted above. */
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);
  
  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);
  
  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);
  
  /* We don't need to reduce X(0) */
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);
    
#undef BUTTERFLY

  /*
   * Multiply by twiddle factors (X0 is left unmultiplied)
   */

  X(1) = v16_mul(X(1), FFT64_Twiddle[0].v16);
  X(2) = v16_mul(X(2), FFT64_Twiddle[1].v16);
  X(3) = v16_mul(X(3), FFT64_Twiddle[2].v16);
  X(4) = v16_mul(X(4), FFT64_Twiddle[3].v16);
  X(5) = v16_mul(X(5), FFT64_Twiddle[4].v16);
  X(6) = v16_mul(X(6), FFT64_Twiddle[5].v16);
  X(7) = v16_mul(X(7), FFT64_Twiddle[6].v16);

  /*
   * Transpose the FFT state with a revbin order permutation
   * on the rows and the column.
   * This will make the full FFT_64 in order.
   */

/* Use the in-place interleave when the platform header provides one;
 * otherwise build it from the low/high interleave helpers. */
#ifdef v16_interleave_inplace
#define INTERLEAVE(i,j) v16_interleave_inplace(X(i), X(j))
#else
#define INTERLEAVE(i,j)                          \
  do {                                           \
    v16 t1= X(i);                                \
    v16 t2= X(j);                                \
    X(i) = v16_interleavel(t1, t2);              \
    X(j) = v16_interleaveh(t1, t2);              \
  } while(0)
#endif

  INTERLEAVE(0, 1);
  INTERLEAVE(2, 3);
  INTERLEAVE(4, 5);
  INTERLEAVE(6, 7);

  INTERLEAVE(0, 2);
  INTERLEAVE(1, 3);
  INTERLEAVE(4, 6);
  INTERLEAVE(5, 7);

  INTERLEAVE(0, 4);
  INTERLEAVE(1, 5);
  INTERLEAVE(2, 6);
  INTERLEAVE(3, 7);

#undef INTERLEAVE

  /*
   * Finish with 8 parallels DIT FFT_8
   *
   * FFT_8 using w=4 as 8th root of unity
   *  Unrolled decimation in time (DIT) radix-2 NTT.
   *  Input data is in revbin_permuted order.
   */
  
/* DIT butterfly: when n is non-zero, v is first shifted left by wn<n>;
 * then X(i) = u + v and X(j) = u - v. */
#define BUTTERFLY(i,j,n)                                \
  do {                                                  \
    v16 u= X(i);                                        \
    v16 v= X(j);                                        \
    if (n)                                              \
      v = v16_shift_l(v, XCAT(wn,n));			\
    X(i) = v16_add(u, v);                               \
    X(j) = v16_sub(u, v);                               \
  } while(0)

  /* Every vector is reduced before the DIT butterflies start. */
  DO_REDUCE(0);
  DO_REDUCE(1);
  DO_REDUCE(2);
  DO_REDUCE(3);
  DO_REDUCE(4);
  DO_REDUCE(5);
  DO_REDUCE(6);
  DO_REDUCE(7);
  
  BUTTERFLY(0, 1, 0);
  BUTTERFLY(2, 3, 0);
  BUTTERFLY(4, 5, 0);
  BUTTERFLY(6, 7, 0);
  
  BUTTERFLY(0, 2, 0);
  BUTTERFLY(4, 6, 0);
  BUTTERFLY(1, 3, 2);
  BUTTERFLY(5, 7, 2);
  
  /* X7 is reduced before it is shifted by wn3 in the last butterfly row. */
  DO_REDUCE(7);
  
  BUTTERFLY(0, 4, 0);
  BUTTERFLY(1, 5, 1);
  BUTTERFLY(2, 6, 2);
  BUTTERFLY(3, 7, 3);
  
  DO_REDUCE_FULL_S(0);
  DO_REDUCE_FULL_S(1);
  DO_REDUCE_FULL_S(2);
  DO_REDUCE_FULL_S(3);
  DO_REDUCE_FULL_S(4);
  DO_REDUCE_FULL_S(5);
  DO_REDUCE_FULL_S(6);
  DO_REDUCE_FULL_S(7);
  
#undef BUTTERFLY

  /* Write the transformed vectors back over the input. */
  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X

}