/*!
 * \brief Multiply the two given vectors of bytes
 */
ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
    auto aodd    = _mm256_srli_epi16(lhs.value, 8);
    auto bodd    = _mm256_srli_epi16(rhs.value, 8);
    auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
    auto mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}
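AVX2 has no 8-bit multiply, so the ETL routine above synthesizes one from 16-bit multiplies: even-indexed bytes come straight out of the low byte of each 16-bit product, odd-indexed bytes are shifted down, multiplied, shifted back up, and the two halves are blended. A minimal standalone sketch (plain __m256i instead of ETL's avx_simd_byte wrapper, harness names are mine) that checks the trick against scalar byte multiplication:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

static __m256i mul_epi8(__m256i lhs, __m256i rhs) {
    __m256i aodd    = _mm256_srli_epi16(lhs, 8);                  // odd bytes moved to the low half of each word
    __m256i bodd    = _mm256_srli_epi16(rhs, 8);
    __m256i muleven = _mm256_mullo_epi16(lhs, rhs);               // low byte of each word = even-lane product
    __m256i mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
    // Mask bytes 0xFF select the even lanes from muleven, 0x00 keeps mulodd.
    return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
}

int main() {
    alignas(32) uint8_t a[32], b[32], r[32];
    for (int i = 0; i < 32; ++i) { a[i] = uint8_t(3 * i + 7); b[i] = uint8_t(5 * i + 1); }
    __m256i v = mul_epi8(_mm256_load_si256((const __m256i*)a),
                         _mm256_load_si256((const __m256i*)b));
    _mm256_store_si256((__m256i*)r, v);
    for (int i = 0; i < 32; ++i)
        if (r[i] != uint8_t(a[i] * b[i])) { std::printf("mismatch at %d\n", i); return 1; }
    std::printf("ok\n");
    return 0;
}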
template <bool align> SIMD_INLINE void VectorProduct(const __m256i & vertical, const uint8_t * horizontal, uint8_t * dst)
{
    __m256i _horizontal = Load<align>((__m256i*)horizontal);
    __m256i lo = DivideI16By255(_mm256_mullo_epi16(vertical, _mm256_unpacklo_epi8(_horizontal, K_ZERO)));
    __m256i hi = DivideI16By255(_mm256_mullo_epi16(vertical, _mm256_unpackhi_epi8(_horizontal, K_ZERO)));
    Store<align>((__m256i*)dst, _mm256_packus_epi16(lo, hi));
}
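VectorProduct scales each byte of horizontal by the per-lane factor in vertical and renormalizes by 255. The DivideI16By255 helper is assumed to use the common fixed-point identity (v + 1 + (v >> 8)) >> 8 for v/255; this scalar loop measures how that identity behaves over every possible product of two bytes:

#include <cstdio>
#include <cstdlib>

// Hedged sketch: assumes DivideI16By255(v) == (v + 1 + (v >> 8)) >> 8, the
// usual shift-based substitute for dividing a 16-bit product by 255.
int main() {
    int worst = 0;
    for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b) {
            int v = a * b;
            int approx = (v + 1 + (v >> 8)) >> 8;
            int exact  = v / 255;               // truncating division
            int err = std::abs(approx - exact);
            if (err > worst) worst = err;
        }
    std::printf("max |approx - v/255| over all byte products: %d\n", worst);
    return 0;
}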
template <bool srcAlign, bool dstAlign> SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const __m256i & index, ptrdiff_t offset, uint16_t * dst)
{
    const __m256i _src = Load<srcAlign>((__m256i*)(src + offset));
    const __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<srcAlign>((__m256i*)(mask + offset)), index), K8_01);
    __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask));
    __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask));
    Store<dstAlign>((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20));
    Store<dstAlign>((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
}
template <bool align, bool compensation> SIMD_INLINE __m256i MainRowX5x5(uint16_t * dst)
{
    __m256i t0 = _mm256_loadu_si256((__m256i*)(dst - 2));
    __m256i t1 = _mm256_loadu_si256((__m256i*)(dst - 1));
    __m256i t2 = Load<align>((__m256i*)dst);
    __m256i t3 = _mm256_loadu_si256((__m256i*)(dst + 1));
    __m256i t4 = _mm256_loadu_si256((__m256i*)(dst + 2));
    t2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(t2, K16_0006), _mm256_mullo_epi16(_mm256_add_epi16(t1, t3), K16_0004)), _mm256_add_epi16(t0, t4));
    return DivideBy256<compensation>(t2);
}
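The constants spell out the 5-tap binomial kernel 1-4-6-4-1: one pass of a separable 5x5 Gaussian whose row and column weights together sum to 16 * 16 = 256, hence the final division by 256. A scalar model of one output element (assumptions: K16_0006/K16_0004 hold the weights 6 and 4, and DivideBy256<true> adds a 128 rounding term before shifting; both inferred from the names):

#include <cstdint>

static uint16_t main_row_5x5_scalar(const uint16_t *dst, bool compensation) {
    // 1*t0 + 4*t1 + 6*t2 + 4*t3 + 1*t4, then divide by 256 (optionally rounded)
    uint32_t sum = dst[-2] + 4u * dst[-1] + 6u * dst[0] + 4u * dst[1] + dst[2];
    return (uint16_t)((sum + (compensation ? 128u : 0u)) >> 8);
}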
void fft128_2way( void *a )
{
    int i;
    // Temporary space to help with the interleaving at the end
    __m256i B[8];
    __m256i *A = (__m256i*) a;
    //  __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

    /* Size-2 butterflies */
    for ( i = 0; i < 8; i++ )
    {
        B[ i ]   = _mm256_add_epi16( A[ i ], A[ i+8 ] );
        B[ i ]   = REDUCE_FULL_S( B[ i ] );
        A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
        A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
        A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
        A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
    }

    fft64_2way( B );
    fft64_2way( A+8 );

    /* Transpose (i.e. interleave) */
    for ( i = 0; i < 8; i++ )
    {
        A[ 2*i ]   = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] );
        A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] );
    }
}
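Each loop iteration is one radix-2 decimation-in-frequency butterfly: the sum feeds the first half-size transform and the twiddled difference feeds the second. A minimal scalar model of a single butterfly, assuming REDUCE_FULL_S reduces modulo 257 (the modulus used by the SIMD hash family's NTT; the sample values and twiddle below are purely illustrative):

#include <cstdio>

static void butterfly2_scalar(int *lo, int *hi, int twiddle) {
    int a = *lo, b = *hi;
    *lo = (a + b) % 257;                            // B[i]   = A[i] + A[i+8]
    *hi = (((a - b) % 257 + 257) * twiddle) % 257;  // A[i+8] = (A[i] - A[i+8]) * w^i
}

int main() {
    int lo = 300, hi = 45;              // arbitrary sample inputs
    butterfly2_scalar(&lo, &hi, 46);    // 46 stands in for FFT128_Twiddle[i]
    std::printf("lo=%d hi=%d\n", lo, hi);
    return 0;
}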
void maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length)
{
    uint8_t *end;
    register __m256i reg1, reg2, ri[4], sp[4], mi[4];
    const uint8_t *p = pt[constant];

    if (constant == 0)
        return;

    if (constant == 1) {
        xorr_avx2(region1, region2, length);
        return;
    }

    mi[0] = _mm256_set1_epi8(0x11);
    mi[1] = _mm256_set1_epi8(0x22);
    mi[2] = _mm256_set1_epi8(0x44);
    mi[3] = _mm256_set1_epi8(0x88);
    sp[0] = _mm256_set1_epi16(p[0]);
    sp[1] = _mm256_set1_epi16(p[1]);
    sp[2] = _mm256_set1_epi16(p[2]);
    sp[3] = _mm256_set1_epi16(p[3]);

    for (end = region1 + length; region1 < end; region1 += 32, region2 += 32) {
        reg2 = _mm256_load_si256((void *)region2);
        reg1 = _mm256_load_si256((void *)region1);
        ri[0] = _mm256_and_si256(reg2, mi[0]);
        ri[1] = _mm256_and_si256(reg2, mi[1]);
        ri[2] = _mm256_and_si256(reg2, mi[2]);
        ri[3] = _mm256_and_si256(reg2, mi[3]);
        ri[1] = _mm256_srli_epi16(ri[1], 1);
        ri[2] = _mm256_srli_epi16(ri[2], 2);
        ri[3] = _mm256_srli_epi16(ri[3], 3);
        ri[0] = _mm256_mullo_epi16(ri[0], sp[0]);
        ri[1] = _mm256_mullo_epi16(ri[1], sp[1]);
        ri[2] = _mm256_mullo_epi16(ri[2], sp[2]);
        ri[3] = _mm256_mullo_epi16(ri[3], sp[3]);
        ri[0] = _mm256_xor_si256(ri[0], ri[1]);
        ri[2] = _mm256_xor_si256(ri[2], ri[3]);
        ri[0] = _mm256_xor_si256(ri[0], ri[2]);
        ri[0] = _mm256_xor_si256(ri[0], reg1);
        _mm256_store_si256((void *)region1, ri[0]);
    }
}
template<bool align> SIMD_INLINE void MainRowY5x5(__m256i odd, __m256i even, Buffer & buffer, size_t offset)
{
    __m256i cp = _mm256_mullo_epi16(odd, K16_0004);
    __m256i c0 = Load<align>((__m256i*)(buffer.in0 + offset));
    __m256i c1 = Load<align>((__m256i*)(buffer.in1 + offset));
    Store<align>((__m256i*)(buffer.dst + offset), _mm256_add_epi16(even, _mm256_add_epi16(c1, _mm256_add_epi16(cp, _mm256_mullo_epi16(c0, K16_0006)))));
    Store<align>((__m256i*)(buffer.out1 + offset), _mm256_add_epi16(c0, cp));
    Store<align>((__m256i*)(buffer.out0 + offset), even);
}
int main()
{
    const ssize_t A = 3;
    const size_t Awidth = 2;
    const size_t Dwidth = 4;
    const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
    const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
    const ssize_t Cwidth = Awidth + Dwidth;
    const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
    const size_t numCodewords = (1ull << Cwidth);
    std::cout << "numCodewords: " << numCodewords << std::endl;

    const size_t numMasks = numCodewords / (sizeof(int) * 4); // how many 16-lane masks we will generate
    int * pNonCodewordMasks = new int[numMasks];
    const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
    std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;

    for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
        int tmpMask = 0;
        for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
            if ((cw % A) != 0) { // we want the non-codewords
                // expand to 2 mask bits per 16-bit lane, because AVX2 has no 16-bit movemask
                tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1));
            }
        }
        pNonCodewordMasks[posMask] = tmpMask;
    }
    std::cout << "numMasks: " << numMasks << std::endl;
    std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
    for (size_t posMask = 0; posMask < numMasks; ++posMask) {
        std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
    }
    std::cout << std::dec << std::endl << std::setfill(' ');

    auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8, c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
    auto mmAddUp = _mm256_set1_epi16(16);
    auto mmAinv = _mm256_set1_epi16(AInv);
    auto mmDmin = _mm256_set1_epi16(Dmin);
    auto mmDmax = _mm256_set1_epi16(Dmax);
    const size_t posEnd = (1ull << Cwidth);
    __m256i mmFillUp[] = { _mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1)) }; // fill up all non-codeword bits with 1's if necessary
    std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
    std::cout << std::setfill('0') << std::hex;

    for (size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
        auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
        auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
        auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
        auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
        auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
        auto mm5 = _mm256_or_si256(mm3, mm4);
        auto mm6 = _mm256_and_si256(mm2, mm5);
        auto mask = _mm256_movemask_epi8(mm6);
        if (mask & pNonCodewordMasks[posMask]) {
            std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        } else {
            std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
        }
        mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
    }
    std::cout << std::setfill(' ') << std::dec;
}
template<bool align> SIMD_INLINE void FirstRow5x5(__m256i src, Buffer & buffer, size_t offset)
{
    Store<align>((__m256i*)(buffer.in0 + offset), src);
    Store<align>((__m256i*)(buffer.in1 + offset), _mm256_mullo_epi16(src, K16_0005));
}
__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
  // CHECK: mul <16 x i16>
  return _mm256_mullo_epi16(a, b);
}
/*!
 * \brief Multiply the two given vectors of shorts
 */
ETL_STATIC_INLINE(avx_simd_short) mul(avx_simd_short lhs, avx_simd_short rhs) {
    return _mm256_mullo_epi16(lhs.value, rhs.value);
}
template <typename PixelType>
static FORCE_INLINE void FlowInterExtra_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog,
        const int16_t *VXFullBB, const int16_t *VXFullFF, const int16_t *VYFullBB, const int16_t *VYFullFF,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);
    __m256i dstFF = lookup_AVX2(VXFullFF, VYFullFF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstBB = lookup_AVX2(VXFullBB, VYFullBB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i minfb = mm256_min_epu<PixelType>(dstF, dstB);
    __m256i maxfb = mm256_max_epu<PixelType>(dstF, dstB);

    __m256i medianBB = mm256_max_epu<PixelType>(minfb, mm256_min_epu<PixelType>(maxfb, dstBB));
    __m256i medianFF = mm256_max_epu<PixelType>(minfb, mm256_min_epu<PixelType>(maxfb, dstFF));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    if (sizeof(PixelType) == 1) {
        dstF = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB = _mm256_mullo_epi16(dstB, maskb_inv);
        medianBB = _mm256_mullo_epi16(medianBB, maskf);
        medianFF = _mm256_mullo_epi16(medianFF, maskb);
    } else {
        dstF = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB = _mm256_mullo_epi32(dstB, maskb_inv);
        medianBB = _mm256_mullo_epi32(medianBB, maskf);
        medianFF = _mm256_mullo_epi32(medianFF, maskb);
    }

    dstF = _mm256_add_epi32(dstF, dwords_255);
    dstB = _mm256_add_epi32(dstB, dwords_255);

    dstF = _mm256_add_epi32(dstF, medianBB);
    dstB = _mm256_add_epi32(dstB, medianFF);

    dstF = _mm256_srai_epi32(dstF, 8);
    dstB = _mm256_srai_epi32(dstB, 8);

    if (sizeof(PixelType) == 2) {
        dstF = _mm256_sub_epi16(dstF, _mm256_set1_epi32(32768));
        dstB = _mm256_sub_epi16(dstB, _mm256_set1_epi32(32768));
    }

    dstF = _mm256_madd_epi16(dstF, dwords_256_time256);
    dstB = _mm256_madd_epi16(dstB, dwords_time256);

    if (sizeof(PixelType) == 2) {
        // dstF = _mm256_add_epi32(dstF, _mm256_slli_epi32(dwords_256_time256, 15));
        // dstB = _mm256_add_epi32(dstB, _mm256_slli_epi32(dwords_time256, 15));
        // Knowing that they add up to 256, the two additions can be combined.
        dstF = _mm256_add_epi32(dstF, _mm256_set1_epi32(256 << 15));
    }

    __m256i dst = _mm256_add_epi32(dstF, dstB);
    dst = _mm256_srai_epi32(dst, 8);
    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
template <typename PixelType>
static FORCE_INLINE void FlowInter_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {
    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType));
    __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType));
    dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf;

    if (sizeof(PixelType) == 1) {
        dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf);
    } else {
        dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf);
    }

    __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv);
    __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv);

    if (sizeof(PixelType) == 1) {
        f = _mm256_mullo_epi32(f, maskf);
        b = _mm256_mullo_epi32(b, maskb);
        f = _mm256_add_epi32(f, dwords_255);
        b = _mm256_add_epi32(b, dwords_255);
        f = _mm256_srai_epi32(f, 8);
        b = _mm256_srai_epi32(b, 8);
    } else {
        const __m256i qwords_255 = _mm256_set1_epi64x(255);

        __m256i tempf = _mm256_mul_epu32(f, maskf);
        __m256i tempb = _mm256_mul_epu32(b, maskb);
        tempf = _mm256_add_epi64(tempf, qwords_255);
        tempb = _mm256_add_epi64(tempb, qwords_255);
        tempf = _mm256_srli_epi64(tempf, 8);
        tempb = _mm256_srli_epi64(tempb, 8);

        f = _mm256_srli_epi64(f, 32);
        b = _mm256_srli_epi64(b, 32);
        f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32));
        b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32));
        f = _mm256_add_epi64(f, qwords_255);
        b = _mm256_add_epi64(b, qwords_255);
        f = _mm256_srli_epi64(f, 8);
        b = _mm256_srli_epi64(b, 8);

        f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32));
        b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32));
    }

    f = _mm256_add_epi32(f, dstF_maskf_inv);
    b = _mm256_add_epi32(b, dstB_maskb_inv);

    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);

    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }

    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);
    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
template<bool align> SIMD_INLINE __m256i InterpolateY(const __m256i * pbx0, const __m256i * pbx1, __m256i alpha[2])
{
    __m256i sum = _mm256_add_epi16(_mm256_mullo_epi16(Load<align>(pbx0), alpha[0]), _mm256_mullo_epi16(Load<align>(pbx1), alpha[1]));
    return _mm256_srli_epi16(_mm256_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
}
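InterpolateY is the vertical half of a bilinear resize: a fixed-point weighted average of two row buffers. A hedged scalar model, assuming alpha[0] + alpha[1] == 1 << BILINEAR_SHIFT and K16_FRACTION_ROUND_TERM == 1 << (BILINEAR_SHIFT - 1), the usual round-to-nearest scheme (the exact constants live in Simd's Base namespace):

#include <cstdint>

static inline uint16_t interpolate_y_scalar(uint16_t b0, uint16_t b1,
                                            unsigned alpha0, unsigned alpha1,
                                            unsigned shift /* BILINEAR_SHIFT */) {
    unsigned round = 1u << (shift - 1);  // half the fraction range, for rounding
    return (uint16_t)((b0 * alpha0 + b1 * alpha1 + round) >> shift);
}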
void fft64_2way( void *a )
{
    __m256i* const A = a;
    register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

    X0 = A[0];
    X1 = A[1];
    X2 = A[2];
    X3 = A[3];
    X4 = A[4];
    X5 = A[5];
    X6 = A[6];
    X7 = A[7];

#define DO_REDUCE(i) X(i) = REDUCE( X(i) )

    // Begin with 8 parallel DIF FFT_8s, using w = 4 as the 8th root of unity:
    // an unrolled decimation-in-frequency (DIF) radix-2 NTT.
    // Output data is in revbin_permuted order.
    static const int w[] = {0, 2, 4, 6};
    //  __m256i *Twiddle = (__m256i*)FFT64_Twiddle;

#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

    BUTTERFLY_0( 0, 4 );
    BUTTERFLY_N( 1, 5, 1 );
    BUTTERFLY_N( 2, 6, 2 );
    BUTTERFLY_N( 3, 7, 3 );

    DO_REDUCE( 2 );
    DO_REDUCE( 3 );

    BUTTERFLY_0( 0, 2 );
    BUTTERFLY_0( 4, 6 );
    BUTTERFLY_N( 1, 3, 2 );
    BUTTERFLY_N( 5, 7, 2 );

    DO_REDUCE( 1 );

    BUTTERFLY_0( 0, 1 );
    BUTTERFLY_0( 2, 3 );
    BUTTERFLY_0( 4, 5 );
    BUTTERFLY_0( 6, 7 );

    /* We don't need to reduce X(7) */
    DO_REDUCE_FULL_S( 0 );
    DO_REDUCE_FULL_S( 1 );
    DO_REDUCE_FULL_S( 2 );
    DO_REDUCE_FULL_S( 3 );
    DO_REDUCE_FULL_S( 4 );
    DO_REDUCE_FULL_S( 5 );
    DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

    // Multiply by twiddle factors
    X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
    X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
    X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
    X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
    X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
    X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
    X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

    // Transpose the FFT state with a revbin-order permutation on the rows
    // and the columns. This makes the full FFT_64 in order.
#define INTERLEAVE(i,j) \
do { \
    __m256i t1 = X(i); \
    __m256i t2 = X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
} while(0)

    INTERLEAVE( 1, 0 );
    INTERLEAVE( 3, 2 );
    INTERLEAVE( 5, 4 );
    INTERLEAVE( 7, 6 );

    INTERLEAVE( 2, 0 );
    INTERLEAVE( 3, 1 );
    INTERLEAVE( 6, 4 );
    INTERLEAVE( 7, 5 );

    INTERLEAVE( 4, 0 );
    INTERLEAVE( 5, 1 );
    INTERLEAVE( 6, 2 );
    INTERLEAVE( 7, 3 );

#undef INTERLEAVE

    // Finish with 8 parallel DIT FFT_8s, again using w = 4 as the 8th root
    // of unity: an unrolled decimation-in-time (DIT) radix-2 NTT.
    // Input data is in revbin_permuted order.

#define BUTTERFLY_0( i,j ) \
do { \
    __m256i u = X(j); \
    X(j) = _mm256_sub_epi16( X(j), X(i) ); \
    X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i u = X(j); \
    X(i) = _mm256_slli_epi16( X(i), w[n] ); \
    X(j) = _mm256_sub_epi16( X(j), X(i) ); \
    X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

    DO_REDUCE( 0 );
    DO_REDUCE( 1 );
    DO_REDUCE( 2 );
    DO_REDUCE( 3 );
    DO_REDUCE( 4 );
    DO_REDUCE( 5 );
    DO_REDUCE( 6 );
    DO_REDUCE( 7 );

    BUTTERFLY_0( 0, 1 );
    BUTTERFLY_0( 2, 3 );
    BUTTERFLY_0( 4, 5 );
    BUTTERFLY_0( 6, 7 );

    BUTTERFLY_0( 0, 2 );
    BUTTERFLY_0( 4, 6 );
    BUTTERFLY_N( 1, 3, 2 );
    BUTTERFLY_N( 5, 7, 2 );

    DO_REDUCE( 3 );

    BUTTERFLY_0( 0, 4 );
    BUTTERFLY_N( 1, 5, 1 );
    BUTTERFLY_N( 2, 6, 2 );
    BUTTERFLY_N( 3, 7, 3 );

    DO_REDUCE_FULL_S( 0 );
    DO_REDUCE_FULL_S( 1 );
    DO_REDUCE_FULL_S( 2 );
    DO_REDUCE_FULL_S( 3 );
    DO_REDUCE_FULL_S( 4 );
    DO_REDUCE_FULL_S( 5 );
    DO_REDUCE_FULL_S( 6 );
    DO_REDUCE_FULL_S( 7 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

    A[0] = X0;
    A[1] = X1;
    A[2] = X2;
    A[3] = X3;
    A[4] = X4;
    A[5] = X5;
    A[6] = X6;
    A[7] = X7;

#undef X
}
SIMD_INLINE __m256i BinomialSum16(const __m256i & a, const __m256i & b, const __m256i & c, const __m256i & d)
{
    return _mm256_add_epi16(_mm256_add_epi16(a, d), _mm256_mullo_epi16(_mm256_add_epi16(b, c), K16_0003));
}
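For reference, the same sum written out in scalar form (assuming K16_0003 holds the constant 3 in every 16-bit lane, as its name suggests):

#include <cstdint>

// Scalar equivalent of BinomialSum16: the 4-tap binomial a + 3b + 3c + d,
// i.e. row 3 of Pascal's triangle, truncated modulo 2^16 like the vector code.
static inline uint16_t binomial_sum16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
    return (uint16_t)(a + 3u * (b + c) + d);
}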