// Builds one 8-lane, 8-bit result vector from four buffer rows.
//
// A vector is loaded from each of buffer.src0..src3 at element position
// `offset` (the `align` flag is forwarded to Load), the four vectors are
// combined by BinomialSum16, the sum is passed through DivideBy64, and every
// 16-bit lane is then narrowed to its low 8 bits.
// NOTE(review): the exact weighting applied by BinomialSum16 and the rounding
// behaviour of DivideBy64 are defined elsewhere - confirm there.
template <bool align> SIMD_INLINE uint8x8_t ReduceRow(const Buffer & buffer, size_t offset)
{
    const uint16x8_t scaled = DivideBy64(BinomialSum16(
        Load<align>(buffer.src0 + offset),
        Load<align>(buffer.src1 + offset),
        Load<align>(buffer.src2 + offset),
        Load<align>(buffer.src3 + offset)));

    // vmovn_u16 keeps the low byte of each 16-bit lane (no saturation).
    return vmovn_u16(scaled);
}
// Builds a packed 8-bit result vector from four buffer rows.
//
// A 128-bit vector is loaded from each of buffer.src0..src3 at element
// position `offset` (the `align` flag is forwarded to Load), the four vectors
// are combined by BinomialSum16 and passed through DivideBy64. The result is
// masked with K16_00FF and packed to bytes with unsigned saturation; the
// packed values occupy the low 64 bits of the return value, and the high
// 64 bits come from packing K_ZERO.
// NOTE(review): K16_00FF is presumably 0x00FF in every 16-bit lane and K_ZERO
// an all-zero vector - both are defined elsewhere, confirm there.
template <bool align> SIMD_INLINE __m128i ReduceRow(const Buffer & buffer, size_t offset)
{
    __m128i * const row0 = (__m128i*)(buffer.src0 + offset);
    __m128i * const row1 = (__m128i*)(buffer.src1 + offset);
    __m128i * const row2 = (__m128i*)(buffer.src2 + offset);
    __m128i * const row3 = (__m128i*)(buffer.src3 + offset);

    const __m128i scaled = DivideBy64(BinomialSum16(
        Load<align>(row0), Load<align>(row1), Load<align>(row2), Load<align>(row3)));

    // Mask each 16-bit lane before the saturating pack.
    const __m128i masked = _mm_and_si128(scaled, K16_00FF);
    return _mm_packus_epi16(masked, K_ZERO);
}
// Computes the reduced row as 16-bit lanes (no packing to bytes is done here).
//
// A 256-bit vector is loaded from each of buffer.src0..src3 at element
// position `offset` (the `align` flag is forwarded to Load), the four vectors
// are combined by BinomialSum16 and passed through DivideBy64, and the result
// is masked with K16_00FF before being returned.
// NOTE(review): K16_00FF is presumably 0x00FF in every 16-bit lane - it is
// defined elsewhere, confirm there.
template <bool align> SIMD_INLINE __m256i ReduceRow16(const Buffer & buffer, size_t offset)
{
    __m256i * const row0 = (__m256i*)(buffer.src0 + offset);
    __m256i * const row1 = (__m256i*)(buffer.src1 + offset);
    __m256i * const row2 = (__m256i*)(buffer.src2 + offset);
    __m256i * const row3 = (__m256i*)(buffer.src3 + offset);

    const __m256i scaled = DivideBy64(BinomialSum16(
        Load<align>(row0), Load<align>(row1), Load<align>(row2), Load<align>(row3)));

    return _mm256_and_si256(scaled, K16_00FF);
}