Example #1
 /*!
  * \brief Multiply the two given vectors of bytes
  */
 ETL_STATIC_INLINE(avx_simd_byte) mul(avx_simd_byte lhs, avx_simd_byte rhs) {
     auto aodd    = _mm256_srli_epi16(lhs.value, 8);
     auto bodd    = _mm256_srli_epi16(rhs.value, 8);
     auto muleven = _mm256_mullo_epi16(lhs.value, rhs.value);
     auto mulodd  = _mm256_slli_epi16(_mm256_mullo_epi16(aodd, bodd), 8);
     return _mm256_blendv_epi8(mulodd, muleven, _mm256_set1_epi32(0x00FF00FF));
 }
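AVX2 has no 8-bit counterpart of _mm256_mullo_epi16, so Example #1 splits the bytes into even and odd 16-bit lanes, multiplies each half, and blends the low bytes of the products back together. Below is a minimal, self-contained sketch of the same trick with a scalar cross-check; the test harness and the mullo_epi8 name are illustrative, not part of ETL:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

// Multiply two vectors of 32 unsigned bytes, keeping the low 8 bits of each product.
static __m256i mullo_epi8(__m256i a, __m256i b) {
    __m256i a_odd = _mm256_srli_epi16(a, 8);                                  // odd bytes moved to the low half of each 16-bit lane
    __m256i b_odd = _mm256_srli_epi16(b, 8);
    __m256i even  = _mm256_mullo_epi16(a, b);                                 // even-byte products live in the low bytes
    __m256i odd   = _mm256_slli_epi16(_mm256_mullo_epi16(a_odd, b_odd), 8);   // odd-byte products shifted back up
    return _mm256_blendv_epi8(odd, even, _mm256_set1_epi32(0x00FF00FF));      // pick even bytes from 'even', odd bytes from 'odd'
}

int main() {
    alignas(32) uint8_t a[32], b[32], r[32];
    for (int i = 0; i < 32; i++) { a[i] = (uint8_t)(i * 7 + 3); b[i] = (uint8_t)(255 - i * 5); }
    __m256i v = mullo_epi8(_mm256_load_si256((const __m256i*)a), _mm256_load_si256((const __m256i*)b));
    _mm256_store_si256((__m256i*)r, v);
    for (int i = 0; i < 32; i++)
        if (r[i] != (uint8_t)(a[i] * b[i])) { printf("mismatch at %d\n", i); return 1; }
    printf("ok\n");
    return 0;
}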
Example #2
 template <bool align> SIMD_INLINE void VectorProduct(const __m256i & vertical, const uint8_t * horizontal, uint8_t * dst)
 {
     __m256i _horizontal = Load<align>((__m256i*)horizontal);
     __m256i lo = DivideI16By255(_mm256_mullo_epi16(vertical, _mm256_unpacklo_epi8(_horizontal, K_ZERO)));
     __m256i hi = DivideI16By255(_mm256_mullo_epi16(vertical, _mm256_unpackhi_epi8(_horizontal, K_ZERO)));
     Store<align>((__m256i*)dst, _mm256_packus_epi16(lo, hi));
 } 
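DivideI16By255, K_ZERO, Load and Store in Example #2 are helpers from the surrounding library and are not shown here; K_ZERO is presumably an all-zero vector used to widen bytes to 16-bit lanes. A common exact formula for v/255 on a 16-bit product of two bytes is (v + 1 + (v >> 8)) >> 8; a hedged sketch of such a helper, assuming that is what DivideI16By255 computes:

#include <immintrin.h>

// Assumed equivalent of DivideI16By255: exact v / 255 for v in [0, 255*255] in every 16-bit lane.
static inline __m256i DivideI16By255_sketch(__m256i v) {
    __m256i t = _mm256_add_epi16(_mm256_add_epi16(v, _mm256_set1_epi16(1)), _mm256_srli_epi16(v, 8));
    return _mm256_srli_epi16(t, 8);
}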
Example #3
 template <bool srcAlign, bool dstAlign> SIMD_INLINE void MaskSrc(const uint8_t * src, const uint8_t * mask, const __m256i & index, ptrdiff_t offset, uint16_t * dst)
 {
     const __m256i _src = Load<srcAlign>((__m256i*)(src + offset));
     const __m256i _mask = _mm256_and_si256(_mm256_cmpeq_epi8(Load<srcAlign>((__m256i*)(mask + offset)), index), K8_01);
     __m256i lo = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<0>(_src)), UnpackU8<0>(_mask));
     __m256i hi = _mm256_mullo_epi16(_mm256_add_epi16(K16_0008, UnpackU8<1>(_src)), UnpackU8<1>(_mask));
     Store<dstAlign>((__m256i*)(dst + offset) + 0, _mm256_permute2x128_si256(lo, hi, 0x20)); 
     Store<dstAlign>((__m256i*)(dst + offset) + 1, _mm256_permute2x128_si256(lo, hi, 0x31));
 }
 template <bool align, bool compensation> SIMD_INLINE __m256i MainRowX5x5(uint16_t * dst)
 {
     __m256i t0 = _mm256_loadu_si256((__m256i*)(dst - 2));
     __m256i t1 = _mm256_loadu_si256((__m256i*)(dst - 1));
     __m256i t2 = Load<align>((__m256i*)dst);
     __m256i t3 = _mm256_loadu_si256((__m256i*)(dst + 1));
     __m256i t4 = _mm256_loadu_si256((__m256i*)(dst + 2));
     t2 = _mm256_add_epi16(_mm256_add_epi16(_mm256_mullo_epi16(t2, K16_0006), _mm256_mullo_epi16(_mm256_add_epi16(t1, t3), K16_0004)), _mm256_add_epi16(t0, t4));
     return DivideBy256<compensation>(t2);
 }
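MaskSrc and MainRowX5x5 above come from a 5x5 binomial (Gaussian-like) blur: MainRowX5x5 is the horizontal pass with weights 1-4-6-4-1, where K16_0006 and K16_0004 presumably hold the 16-bit constants 6 and 4, and DivideBy256<compensation> then normalizes by the separable kernel's total weight. A scalar reference for one output element of that row, for illustration only:

#include <cstdint>

// Scalar reference for one element of the 1-4-6-4-1 horizontal pass (before the divide-by-256 step).
static inline uint16_t MainRowX5x5_scalar(const uint16_t* dst) {
    return (uint16_t)(dst[-2] + 4 * dst[-1] + 6 * dst[0] + 4 * dst[1] + dst[2]);
}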
Example #5
void fft128_2way( void *a )
{
  int i;
  // Temporary space to help with the interleaving at the end
  __m256i B[8];
  __m256i *A = (__m256i*) a;
//  __m256i *Twiddle = (__m256i*)FFT128_Twiddle;

  /* Size-2 butterflies */
  for ( i = 0; i<8; i++ )
  {
    B[ i ]   = _mm256_add_epi16( A[ i ], A[ i+8 ] );
    B[ i ]   = REDUCE_FULL_S( B[ i ] );
    A[ i+8 ] = _mm256_sub_epi16( A[ i ], A[ i+8 ] );
    A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
    A[ i+8 ] = _mm256_mullo_epi16( A[ i+8 ], FFT128_Twiddle[i].m256i );
    A[ i+8 ] = REDUCE_FULL_S( A[ i+8 ] );
  }

  fft64_2way( B );
  fft64_2way( A+8 );

  /* Transpose (i.e. interleave) */
  for ( i = 0; i < 8; i++ )
  {
    A[ 2*i   ] = _mm256_unpacklo_epi16( B[ i ], A[ i+8 ] );
    A[ 2*i+1 ] = _mm256_unpackhi_epi16( B[ i ], A[ i+8 ] );
  }
}
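fft128_2way (and fft64_2way in Example #15) interleaves the butterflies with REDUCE/REDUCE_FULL_S calls; in NTT-based designs of this kind the reductions keep the 16-bit lanes within range of a small modulus (the SIMD hash family, which this code resembles, works modulo 257). A scalar sketch of one size-2 butterfly with its twiddle multiplication, under that mod-257 assumption:

#include <cstdint>

// Hypothetical scalar view of one size-2 butterfly, assuming arithmetic modulo 257.
static inline void butterfly2(int32_t a0, int32_t a1, int32_t twiddle,
                              int32_t &sum, int32_t &diff_tw) {
    sum     = (a0 + a1) % 257;                              // B[i]   = A[i] + A[i+8], then reduce
    diff_tw = (((a0 - a1) % 257 + 257) * twiddle) % 257;    // A[i+8] = (A[i] - A[i+8]) * twiddle, reduced
}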
Example #6
void
maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2,
					uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m256i reg1, reg2, ri[4], sp[4], mi[4];
	const uint8_t *p = pt[constant];

	if (constant == 0)
		return;

	if (constant == 1) {
		xorr_avx2(region1, region2, length);
		return;
	}
	
	mi[0] = _mm256_set1_epi8(0x11);
	mi[1] = _mm256_set1_epi8(0x22);
	mi[2] = _mm256_set1_epi8(0x44);
	mi[3] = _mm256_set1_epi8(0x88);
	sp[0] = _mm256_set1_epi16(p[0]);
	sp[1] = _mm256_set1_epi16(p[1]);
	sp[2] = _mm256_set1_epi16(p[2]);
	sp[3] = _mm256_set1_epi16(p[3]);

	for (end=region1+length; region1<end; region1+=32, region2+=32) {
		reg2 = _mm256_load_si256((const __m256i *)region2);
		reg1 = _mm256_load_si256((const __m256i *)region1);
		ri[0] = _mm256_and_si256(reg2, mi[0]);
		ri[1] = _mm256_and_si256(reg2, mi[1]);
		ri[2] = _mm256_and_si256(reg2, mi[2]);
		ri[3] = _mm256_and_si256(reg2, mi[3]);
		ri[1] = _mm256_srli_epi16(ri[1], 1);
		ri[2] = _mm256_srli_epi16(ri[2], 2);
		ri[3] = _mm256_srli_epi16(ri[3], 3);
		ri[0] = _mm256_mullo_epi16(ri[0], sp[0]);
		ri[1] = _mm256_mullo_epi16(ri[1], sp[1]);
		ri[2] = _mm256_mullo_epi16(ri[2], sp[2]);
		ri[3] = _mm256_mullo_epi16(ri[3], sp[3]);
		ri[0] = _mm256_xor_si256(ri[0], ri[1]);
		ri[2] = _mm256_xor_si256(ri[2], ri[3]);
		ri[0] = _mm256_xor_si256(ri[0], ri[2]);
		ri[0] = _mm256_xor_si256(ri[0], reg1);
		_mm256_store_si256((__m256i *)region1, ri[0]);
	}
}
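maddrc16_imul_avx2 steps through both regions 32 bytes at a time with aligned loads and stores, so its buffers must be 32-byte aligned and length a multiple of 32; the pt[] lookup table and xorr_avx2 come from the surrounding library and are not shown. A hedged call-site sketch under those assumptions:

#include <cstdint>
#include <cstdlib>
#include <cstring>

void maddrc16_imul_avx2(uint8_t* region1, const uint8_t* region2, uint8_t constant, size_t length);

// Hypothetical call site: 32-byte aligned buffers, length a multiple of 32,
// matching the aligned _mm256_load/store_si256 and the 32-byte loop stride above.
void example_call(size_t length /* multiple of 32 */) {
    uint8_t* dst = static_cast<uint8_t*>(std::aligned_alloc(32, length));
    uint8_t* src = static_cast<uint8_t*>(std::aligned_alloc(32, length));
    std::memset(dst, 0x00, length);
    std::memset(src, 0x5a, length);
    maddrc16_imul_avx2(dst, src, 3, length);  // presumably: dst ^= 3 * src over the library's finite field
    std::free(dst);
    std::free(src);
}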
 template<bool align> SIMD_INLINE void MainRowY5x5(__m256i odd, __m256i even, Buffer & buffer, size_t offset)
 {
     __m256i cp = _mm256_mullo_epi16(odd, K16_0004);
     __m256i c0 = Load<align>((__m256i*)(buffer.in0 + offset));
     __m256i c1 = Load<align>((__m256i*)(buffer.in1 + offset));
     Store<align>((__m256i*)(buffer.dst + offset), _mm256_add_epi16(even, _mm256_add_epi16(c1, _mm256_add_epi16(cp, _mm256_mullo_epi16(c0, K16_0006)))));
     Store<align>((__m256i*)(buffer.out1 + offset), _mm256_add_epi16(c0, cp));
     Store<align>((__m256i*)(buffer.out0 + offset), even);
 }
Example #8
#include <cstdint>
#include <cstddef>
#include <sys/types.h>   // ssize_t
#include <iostream>
#include <iomanip>
#include <immintrin.h>

// ext_euklidean() (extended Euclidean algorithm / modular inverse) is provided elsewhere.
int main() {
	const ssize_t A = 3;
	const size_t Awidth = 2;
	const size_t Dwidth = 4;
	const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
	const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
	const ssize_t Cwidth = Awidth + Dwidth;
	const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
	const size_t numCodewords = (1ull << Cwidth);
	std::cout << "numCodewords: " << numCodewords << std::endl;
	const size_t numMasks = numCodewords / (sizeof(int) * 4); // How many masks will we generate?
	int * pNonCodewordMasks = new int[numMasks];
	const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
	std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;
	for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
		int tmpMask = 0;
		for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
			if ((cw % A) != 0) { // we want the non-codewords
				// std::cout << "cw % A != 0: " << cw << std::endl;
				tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1)); // two mask bits per 16-bit lane, because _mm256_movemask_epi8 yields one bit per byte (there is no 16-bit movemask)
			}
		}
		pNonCodewordMasks[posMask] = tmpMask;
	}
	std::cout << "numMasks: " << numMasks << std::endl;
	std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
	for (size_t posMask = 0; posMask < numMasks; ++posMask) {
		std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
	}
	std::cout << std::dec << std::endl << std::setfill(' ');
	auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8, c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
	auto mmAddUp = _mm256_set1_epi16(16);
	auto mmAinv = _mm256_set1_epi16(AInv);
	auto mmDmin = _mm256_set1_epi16(Dmin);
	auto mmDmax = _mm256_set1_epi16(Dmax);
	const size_t posEnd = (1ull << Cwidth);
	__m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))}; // fill up all non-codeword bits with 1's if necessary
	std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
	std::cout << std::setfill('0') << std::hex;
	for(size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
		auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
		auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
		auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
		auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
		auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
		auto mm5 = _mm256_or_si256(mm3, mm4);
		auto mm6 = _mm256_and_si256(mm2, mm5);
		auto mask = _mm256_movemask_epi8(mm6);
		if (mask & pNonCodewordMasks[posMask]) {
			std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
		} else {
			std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
		}
		mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
	}
	std::cout << std::setfill(' ') << std::dec;
	delete[] pNonCodewordMasks;
}
 template<bool align> SIMD_INLINE void FirstRow5x5(__m256i src, Buffer & buffer, size_t offset)
 {
     Store<align>((__m256i*)(buffer.in0 + offset), src);
     Store<align>((__m256i*)(buffer.in1 + offset), _mm256_mullo_epi16(src, K16_0005));
 }
Example #10
__m256i test_mm256_mullo_epi16(__m256i a, __m256i b) {
  // CHECK: mul <16 x i16>
  return _mm256_mullo_epi16(a, b);
}
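Example #10 is a clang codegen test: the CHECK line asserts that the intrinsic lowers to a plain <16 x i16> multiply. Semantically, _mm256_mullo_epi16 keeps the low 16 bits of each product, i.e. element-wise multiplication modulo 2^16 (signed and unsigned inputs give the same low bits). A small stand-alone check of that against scalar uint16_t arithmetic:

#include <immintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    alignas(32) uint16_t a[16], b[16], r[16];
    for (int i = 0; i < 16; i++) { a[i] = (uint16_t)(1000 * i + 7); b[i] = (uint16_t)(3000 + 911 * i); }
    __m256i v = _mm256_mullo_epi16(_mm256_load_si256((const __m256i*)a),
                                   _mm256_load_si256((const __m256i*)b));
    _mm256_store_si256((__m256i*)r, v);
    for (int i = 0; i < 16; i++)
        if (r[i] != (uint16_t)(a[i] * b[i])) { printf("mismatch at %d\n", i); return 1; }  // low 16 bits match
    printf("ok\n");
    return 0;
}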
Example #11
 /*!
  * \brief Multiply the two given vectors of shorts
  */
 ETL_STATIC_INLINE(avx_simd_short) mul(avx_simd_short lhs, avx_simd_short rhs) {
     return _mm256_mullo_epi16(lhs.value, rhs.value);
 }
template <typename PixelType>
static FORCE_INLINE void FlowInterExtra_8px_AVX2(
        int w, PixelType *pdst,
        const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const int16_t *VXFullBB, const int16_t *VXFullFF,
        const int16_t *VYFullBB, const int16_t *VYFullFF,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);
    __m256i dstFF = lookup_AVX2(VXFullFF, VYFullFF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstBB = lookup_AVX2(VXFullBB, VYFullBB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i minfb = mm256_min_epu<PixelType>(dstF, dstB);
    __m256i maxfb = mm256_max_epu<PixelType>(dstF, dstB);

    __m256i medianBB = mm256_max_epu<PixelType>(minfb, mm256_min_epu<PixelType>(maxfb, dstBB));
    __m256i medianFF = mm256_max_epu<PixelType>(minfb, mm256_min_epu<PixelType>(maxfb, dstFF));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    if (sizeof(PixelType) == 1) {
        dstF = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB = _mm256_mullo_epi16(dstB, maskb_inv);

        medianBB = _mm256_mullo_epi16(medianBB, maskf);
        medianFF = _mm256_mullo_epi16(medianFF, maskb);
    } else {
        dstF = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB = _mm256_mullo_epi32(dstB, maskb_inv);

        medianBB = _mm256_mullo_epi32(medianBB, maskf);
        medianFF = _mm256_mullo_epi32(medianFF, maskb);
    }

    dstF = _mm256_add_epi32(dstF, dwords_255);
    dstB = _mm256_add_epi32(dstB, dwords_255);

    dstF = _mm256_add_epi32(dstF, medianBB);
    dstB = _mm256_add_epi32(dstB, medianFF);

    dstF = _mm256_srai_epi32(dstF, 8);
    dstB = _mm256_srai_epi32(dstB, 8);

    if (sizeof(PixelType) == 2) {
        dstF = _mm256_sub_epi16(dstF, _mm256_set1_epi32(32768));
        dstB = _mm256_sub_epi16(dstB, _mm256_set1_epi32(32768));
    }
    dstF = _mm256_madd_epi16(dstF, dwords_256_time256);
    dstB = _mm256_madd_epi16(dstB, dwords_time256);
    if (sizeof(PixelType) == 2) {
//        dstF = _mm256_add_epi32(dstF, _mm256_slli_epi32(dwords_256_time256, 15));
//        dstB = _mm256_add_epi32(dstB, _mm256_slli_epi32(dwords_time256, 15));
        // Knowing that they add up to 256, the two additions can be combined.
        dstF = _mm256_add_epi32(dstF, _mm256_set1_epi32(256 << 15));
    }

    __m256i dst = _mm256_add_epi32(dstF, dstB);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword
    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
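In the sizeof(PixelType) == 1 branches above (and in FlowInter_8px_AVX2 below), every factor, pixel value or 0..255 mask, fits in the low 16 bits of each 32-bit lane and every product fits in 16 bits, so the cheaper _mm256_mullo_epi16 yields the same dwords as _mm256_mullo_epi32 would. A small sketch of that equivalence under those assumptions:

#include <immintrin.h>
#include <cstdio>

int main() {
    // 32-bit lanes holding values <= 255: the 16-bit low multiply equals the full 32-bit multiply.
    __m256i a = _mm256_setr_epi32(0, 1, 17, 100, 128, 200, 254, 255);
    __m256i b = _mm256_setr_epi32(255, 254, 200, 128, 100, 17, 1, 0);
    __m256i lo16 = _mm256_mullo_epi16(a, b);
    __m256i lo32 = _mm256_mullo_epi32(a, b);
    __m256i eq = _mm256_cmpeq_epi32(lo16, lo32);
    printf("%s\n", _mm256_movemask_epi8(eq) == -1 ? "identical" : "different");
    return 0;
}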
template <typename PixelType>
static FORCE_INLINE void FlowInter_8px_AVX2(
        int w, PixelType *pdst,
        const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType));
    __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType));
    dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf;

    if (sizeof(PixelType) == 1) {
        dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv);

        dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf);
    } else {
        dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);

        dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf);
    }

    __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv);
    __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv);

    if (sizeof(PixelType) == 1) {
        f = _mm256_mullo_epi32(f, maskf);
        b = _mm256_mullo_epi32(b, maskb);

        f = _mm256_add_epi32(f, dwords_255);
        b = _mm256_add_epi32(b, dwords_255);

        f = _mm256_srai_epi32(f, 8);
        b = _mm256_srai_epi32(b, 8);
    } else {
        const __m256i qwords_255 = _mm256_set1_epi64x(255);

        __m256i tempf = _mm256_mul_epu32(f, maskf);
        __m256i tempb = _mm256_mul_epu32(b, maskb);
        tempf = _mm256_add_epi64(tempf, qwords_255);
        tempb = _mm256_add_epi64(tempb, qwords_255);
        tempf = _mm256_srli_epi64(tempf, 8);
        tempb = _mm256_srli_epi64(tempb, 8);

        f = _mm256_srli_epi64(f, 32);
        b = _mm256_srli_epi64(b, 32);
        f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32));
        b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32));
        f = _mm256_add_epi64(f, qwords_255);
        b = _mm256_add_epi64(b, qwords_255);
        f = _mm256_srli_epi64(f, 8);
        b = _mm256_srli_epi64(b, 8);
        f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32));
        b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32));
    }

    f = _mm256_add_epi32(f, dstF_maskf_inv);
    b = _mm256_add_epi32(b, dstB_maskb_inv);

    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);

    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }

    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword
    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
Example #14
 template<bool align> SIMD_INLINE __m256i InterpolateY(const __m256i * pbx0, const __m256i * pbx1, __m256i alpha[2])
 {
     __m256i sum = _mm256_add_epi16(_mm256_mullo_epi16(Load<align>(pbx0), alpha[0]), _mm256_mullo_epi16(Load<align>(pbx1), alpha[1]));
     return _mm256_srli_epi16(_mm256_add_epi16(sum, K16_FRACTION_ROUND_TERM), Base::BILINEAR_SHIFT);
 }
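InterpolateY in Example #14 blends two pre-weighted rows with 16-bit alphas, adds a rounding term and shifts by Base::BILINEAR_SHIFT. A scalar picture of one lane, with K16_FRACTION_ROUND_TERM modelled by the usual fixed-point convention (alpha0 + alpha1 == 1 << SHIFT, round == 1 << (SHIFT - 1)), which is an assumption here:

#include <cstdint>

// Scalar sketch of one 16-bit lane of InterpolateY under the fixed-point assumptions above.
static inline uint16_t interpolate_y_scalar(uint16_t x0, uint16_t x1,
                                            uint16_t alpha0, uint16_t alpha1, int SHIFT) {
    uint16_t round = (uint16_t)(1u << (SHIFT - 1));                              // models K16_FRACTION_ROUND_TERM
    uint16_t sum   = (uint16_t)((uint32_t)x0 * alpha0 + (uint32_t)x1 * alpha1 + round);
    return (uint16_t)(sum >> SHIFT);                                             // logical shift, like _mm256_srli_epi16
}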
Example #15
void fft64_2way( void *a )
{
  __m256i* const A = a;
  register __m256i X0, X1, X2, X3, X4, X5, X6, X7;

#define X(i) X##i

  X0 = A[0];
  X1 = A[1];
  X2 = A[2];
  X3 = A[3];
  X4 = A[4];
  X5 = A[5];
  X6 = A[6];
  X7 = A[7];

#define DO_REDUCE(i)   X(i) = REDUCE( X(i) )

   // Begin with 8 parallel DIF FFT_8 passes
   //
   // FFT_8 using w=4 as 8th root of unity
   //  Unrolled decimation in frequency (DIF) radix-2 NTT.
   //  Output data is in revbin_permuted order.

  static const int w[] = {0, 2, 4, 6};
//   __m256i *Twiddle = (__m256i*)FFT64_Twiddle;


#define BUTTERFLY_0( i,j ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_sub_epi16( X(i), v ); \
} while(0)

#define BUTTERFLY_N( i,j,n ) \
do { \
    __m256i v = X(j); \
    X(j) = _mm256_add_epi16( X(i), X(j) ); \
    X(i) = _mm256_slli_epi16( _mm256_sub_epi16( X(i), v ), w[n] ); \
} while(0)

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE( 2 );
  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 1 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  /* We don't need to reduce X(7) */
  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  // Multiply by twiddle factors
  X(6) = _mm256_mullo_epi16( X(6), FFT64_Twiddle[0].m256i );
  X(5) = _mm256_mullo_epi16( X(5), FFT64_Twiddle[1].m256i );
  X(4) = _mm256_mullo_epi16( X(4), FFT64_Twiddle[2].m256i );
  X(3) = _mm256_mullo_epi16( X(3), FFT64_Twiddle[3].m256i );
  X(2) = _mm256_mullo_epi16( X(2), FFT64_Twiddle[4].m256i );
  X(1) = _mm256_mullo_epi16( X(1), FFT64_Twiddle[5].m256i );
  X(0) = _mm256_mullo_epi16( X(0), FFT64_Twiddle[6].m256i );

  // Transpose the FFT state with a revbin-order permutation
  // on the rows and the columns.
  // This puts the full FFT_64 output in order.
#define INTERLEAVE(i,j) \
  do { \
    __m256i t1= X(i); \
    __m256i t2= X(j); \
    X(i) = _mm256_unpacklo_epi16( t1, t2 ); \
    X(j) = _mm256_unpackhi_epi16( t1, t2 ); \
  } while(0)

  INTERLEAVE( 1, 0 );
  INTERLEAVE( 3, 2 );
  INTERLEAVE( 5, 4 );
  INTERLEAVE( 7, 6 );

  INTERLEAVE( 2, 0 );
  INTERLEAVE( 3, 1 );
  INTERLEAVE( 6, 4 );
  INTERLEAVE( 7, 5 );

  INTERLEAVE( 4, 0 );
  INTERLEAVE( 5, 1 );
  INTERLEAVE( 6, 2 );
  INTERLEAVE( 7, 3 );

#undef INTERLEAVE

   // Finish with 8 parallel DIT FFT_8 passes
   // FFT_8 using w=4 as 8th root of unity
   // Unrolled decimation in time (DIT) radix-2 NTT.
   // Input data is in revbin_permuted order.

#define BUTTERFLY_0( i,j ) \
do { \
   __m256i u = X(j); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)


#define BUTTERFLY_N( i,j,n ) \
do { \
   __m256i u = X(j); \
   X(i) = _mm256_slli_epi16( X(i), w[n] ); \
   X(j) = _mm256_sub_epi16( X(j), X(i) ); \
   X(i) = _mm256_add_epi16( u, X(i) ); \
} while(0)

  DO_REDUCE( 0 );
  DO_REDUCE( 1 );
  DO_REDUCE( 2 );
  DO_REDUCE( 3 );
  DO_REDUCE( 4 );
  DO_REDUCE( 5 );
  DO_REDUCE( 6 );
  DO_REDUCE( 7 );

  BUTTERFLY_0( 0, 1 );
  BUTTERFLY_0( 2, 3 );
  BUTTERFLY_0( 4, 5 );
  BUTTERFLY_0( 6, 7 );

  BUTTERFLY_0( 0, 2 );
  BUTTERFLY_0( 4, 6 );
  BUTTERFLY_N( 1, 3, 2 );
  BUTTERFLY_N( 5, 7, 2 );

  DO_REDUCE( 3 );

  BUTTERFLY_0( 0, 4 );
  BUTTERFLY_N( 1, 5, 1 );
  BUTTERFLY_N( 2, 6, 2 );
  BUTTERFLY_N( 3, 7, 3 );

  DO_REDUCE_FULL_S( 0 );
  DO_REDUCE_FULL_S( 1 );
  DO_REDUCE_FULL_S( 2 );
  DO_REDUCE_FULL_S( 3 );
  DO_REDUCE_FULL_S( 4 );
  DO_REDUCE_FULL_S( 5 );
  DO_REDUCE_FULL_S( 6 );
  DO_REDUCE_FULL_S( 7 );

#undef BUTTERFLY_0
#undef BUTTERFLY_N

  A[0] = X0;
  A[1] = X1;
  A[2] = X2;
  A[3] = X3;
  A[4] = X4;
  A[5] = X5;
  A[6] = X6;
  A[7] = X7;

#undef X
}
 SIMD_INLINE __m256i BinomialSum16(const __m256i & a, const __m256i & b, const __m256i & c, const __m256i & d)
 {
     return _mm256_add_epi16(_mm256_add_epi16(a, d), _mm256_mullo_epi16(_mm256_add_epi16(b, c), K16_0003));
 }
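BinomialSum16 is the 1-3-3-1 binomial kernel on 16-bit lanes, with K16_0003 presumably holding the constant 3. In scalar terms (arithmetic wrapping modulo 2^16):

#include <cstdint>

// Scalar equivalent of BinomialSum16 on one 16-bit lane (wraps modulo 2^16, like the epi16 intrinsics).
static inline uint16_t binomial_sum16_scalar(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
    return (uint16_t)(a + 3u * (b + c) + d);
}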