// Widens the bytes of a[0], a[1], a[2] to 16-bit lanes by interleaving with K_ZERO,
// combines the three widened vectors with BinomialSum16 and writes the results to b:
// the low-half result goes to b, the high-half result to b + HA.
// `align` is forwarded to Store to select the store variant.
template<bool align> SIMD_INLINE void BlurCol(__m128i a[3], uint16_t * b)
{
    // Low eight bytes of each input.
    const __m128i lo0 = _mm_unpacklo_epi8(a[0], K_ZERO);
    const __m128i lo1 = _mm_unpacklo_epi8(a[1], K_ZERO);
    const __m128i lo2 = _mm_unpacklo_epi8(a[2], K_ZERO);
    Store<align>((__m128i*)b, BinomialSum16(lo0, lo1, lo2));

    // High eight bytes of each input; read after the first store, as in the original order.
    const __m128i hi0 = _mm_unpackhi_epi8(a[0], K_ZERO);
    const __m128i hi1 = _mm_unpackhi_epi8(a[1], K_ZERO);
    const __m128i hi2 = _mm_unpackhi_epi8(a[2], K_ZERO);
    Store<align>((__m128i*)(b + HA), BinomialSum16(hi0, hi1, hi2));
}
// Loads one 128-bit vector at `offset` from each of buffer.src0, buffer.src1 and
// buffer.src2, combines them with BinomialSum16 and returns the result of DivideBy16.
// `align` is forwarded to Load to select the load variant.
template<bool align> SIMD_INLINE __m128i BlurRow16(const Buffer & buffer, size_t offset)
{
    const __m128i row0 = Load<align>((__m128i*)(buffer.src0 + offset));
    const __m128i row1 = Load<align>((__m128i*)(buffer.src1 + offset));
    const __m128i row2 = Load<align>((__m128i*)(buffer.src2 + offset));
    const __m128i sum = BinomialSum16(row0, row1, row2);
    return DivideBy16(sum);
}
// Unaligned specialization for the trailing block of a row (256-bit variant).
// Reads 32 bytes starting at src - 1, obtains two further vectors from
// LoadAfterLast<false, 1>(src - 1, ...) and returns BinomialSum16 of the first load
// and the second of those vectors.
// NOTE(review): LoadAfterLast presumably replicates the last pixel past the row end - confirm.
template <> SIMD_INLINE __m256i ReduceColTail<false>(const uint8_t * src)
{
    const uint8_t * const start = src - 1;
    const __m256i first = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(start));

    // `middle` is filled by the helper but is not part of the returned sum.
    __m256i middle, last;
    LoadAfterLast<false, 1>(start, middle, last);

    return BinomialSum16(first, last);
}
// Four-input column reduction for an interior block.
// Reads 16 bytes at src - 1 and 16 bytes at src + 1 (both unaligned). From each
// register it forms two vectors: the register masked with K16_00FF, and the register
// shifted right by one byte and then masked with K16_00FF. The four vectors are
// passed to BinomialSum16 in source order.
// NOTE(review): K16_00FF presumably keeps the low byte of every 16-bit lane - confirm.
SIMD_INLINE __m128i ReduceColBody(const uint8_t *src)
{
    const __m128i left = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
    const __m128i right = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));

    const __m128i p0 = _mm_and_si128(left, K16_00FF);
    const __m128i p1 = _mm_and_si128(_mm_srli_si128(left, 1), K16_00FF);
    const __m128i p2 = _mm_and_si128(right, K16_00FF);
    const __m128i p3 = _mm_and_si128(_mm_srli_si128(right, 1), K16_00FF);

    return BinomialSum16(p0, p1, p2, p3);
}
// Four-input column reduction for the leading block of a row.
// Reads 16 bytes at src and 16 bytes at src + 1 (both unaligned). The first input is
// LoadBeforeFirst<1> applied to the register loaded at src, so nothing is read before src.
// Each input is masked with K16_00FF; the fourth is the src + 1 register shifted right
// by one byte before masking.
// NOTE(review): LoadBeforeFirst presumably duplicates the first pixel - confirm.
SIMD_INLINE __m128i ReduceColNose(const uint8_t *src)
{
    const __m128i current = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i next = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));

    const __m128i p0 = _mm_and_si128(LoadBeforeFirst<1>(current), K16_00FF);
    const __m128i p1 = _mm_and_si128(current, K16_00FF);
    const __m128i p2 = _mm_and_si128(next, K16_00FF);
    const __m128i p3 = _mm_and_si128(_mm_srli_si128(next, 1), K16_00FF);

    return BinomialSum16(p0, p1, p2, p3);
}
// Unaligned specialization of the four-input column reduction for the trailing block.
// Only one memory read is performed (16 bytes at src - 1). The remaining inputs are
// derived in registers: LoadAfterLast<1> applied once and twice to that load, plus the
// twice-applied value shifted right by one byte. Every input is masked with K16_00FF.
// NOTE(review): LoadAfterLast presumably replicates the last pixel - confirm.
template <> SIMD_INLINE __m128i ReduceColTail<false>(const uint8_t *src)
{
    const __m128i base = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
    const __m128i step1 = LoadAfterLast<1>(base);
    const __m128i step2 = LoadAfterLast<1>(step1);

    const __m128i p0 = _mm_and_si128(base, K16_00FF);
    const __m128i p1 = _mm_and_si128(step1, K16_00FF);
    const __m128i p2 = _mm_and_si128(step2, K16_00FF);
    const __m128i p3 = _mm_and_si128(_mm_srli_si128(step2, 1), K16_00FF);

    return BinomialSum16(p0, p1, p2, p3);
}
// Loads one 128-bit vector at `offset` from each of buffer.src0..src3, combines them
// with BinomialSum16, applies DivideBy64 and masks the result with K16_00FF.
// The masked 16-bit lanes are then packed to bytes with unsigned saturation; the upper
// eight bytes of the returned register come from K_ZERO.
// `align` is forwarded to Load to select the load variant.
template <bool align> SIMD_INLINE __m128i ReduceRow(const Buffer & buffer, size_t offset)
{
    const __m128i row0 = Load<align>((__m128i*)(buffer.src0 + offset));
    const __m128i row1 = Load<align>((__m128i*)(buffer.src1 + offset));
    const __m128i row2 = Load<align>((__m128i*)(buffer.src2 + offset));
    const __m128i row3 = Load<align>((__m128i*)(buffer.src3 + offset));

    const __m128i averaged = DivideBy64(BinomialSum16(row0, row1, row2, row3));
    const __m128i masked = _mm_and_si128(averaged, K16_00FF);
    return _mm_packus_epi16(masked, K_ZERO);
}
// NEON variant: loads one vector at `offset` from each of buffer.src0..src3, combines
// them with BinomialSum16, applies DivideBy64 and narrows each 16-bit lane to 8 bits
// with vmovn_u16.
// `align` is forwarded to Load to select the load variant.
template <bool align> SIMD_INLINE uint8x8_t ReduceRow(const Buffer & buffer, size_t offset)
{
    const auto row0 = Load<align>(buffer.src0 + offset);
    const auto row1 = Load<align>(buffer.src1 + offset);
    const auto row2 = Load<align>(buffer.src2 + offset);
    const auto row3 = Load<align>(buffer.src3 + offset);

    return vmovn_u16(DivideBy64(BinomialSum16(row0, row1, row2, row3)));
}
// Unaligned specialization of the two-input column reduction for the trailing block.
// Reads 16 bytes at src - 1 and pairs that register with LoadAfterLast<1> applied to it
// twice; both are passed to BinomialSum16.
// NOTE(review): LoadAfterLast presumably replicates the last pixel - confirm.
template <> SIMD_INLINE __m128i ReduceColTail<false>(const uint8_t *src)
{
    const __m128i base = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
    const __m128i once = LoadAfterLast<1>(base);
    const __m128i twice = LoadAfterLast<1>(once);
    return BinomialSum16(base, twice);
}
// Two-input column reduction for an interior block: reads 16 bytes at src - 1 and
// 16 bytes at src + 1 (both unaligned) and returns BinomialSum16 of the two registers.
SIMD_INLINE __m128i ReduceColBody(const uint8_t *src)
{
    const __m128i left = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
    const __m128i right = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));
    return BinomialSum16(left, right);
}
// Two-input column reduction for the leading block of a row.
// Reads 16 bytes at src and 16 bytes at src + 1 (both unaligned). The first input to
// BinomialSum16 is LoadBeforeFirst<1> applied to the register loaded at src, so nothing
// is read before src.
// NOTE(review): LoadBeforeFirst presumably duplicates the first pixel - confirm.
SIMD_INLINE __m128i ReduceColNose(const uint8_t *src)
{
    const __m128i current = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
    const __m128i next = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));
    const __m128i previous = LoadBeforeFirst<1>(current);
    return BinomialSum16(previous, next);
}
// NEON variant operating on three already-loaded 16-bit rows: combines r0, r1, r2 with
// BinomialSum16, applies DivideBy16<compensation> and narrows each 16-bit lane to
// 8 bits with vmovn_u16.
// NOTE(review): `compensation` presumably toggles rounding inside DivideBy16 - confirm.
template <bool compensation> SIMD_INLINE uint8x8_t ReduceRow(const uint16x8_t & r0, const uint16x8_t & r1, const uint16x8_t & r2)
{
    const uint16x8_t sum = BinomialSum16(r0, r1, r2);
    const uint16x8_t scaled = DivideBy16<compensation>(sum);
    return vmovn_u16(scaled);
}
// 256-bit two-input column reduction for an interior block: reads 32 bytes at src - 1
// and 32 bytes at src + 1 (both unaligned) and returns BinomialSum16 of the two registers.
SIMD_INLINE __m256i ReduceColBody(const uint8_t * src)
{
    const __m256i left = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src - 1));
    const __m256i right = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 1));
    return BinomialSum16(left, right);
}
// 256-bit two-input column reduction for the leading block of a row.
// The first input comes from LoadBeforeFirst<false, 1>(src); the second is an unaligned
// 32-byte read at src + 1. Both are passed to BinomialSum16.
// NOTE(review): LoadBeforeFirst presumably duplicates the first pixel instead of
// reading before src - confirm.
SIMD_INLINE __m256i ReduceColNose(const uint8_t * src)
{
    const __m256i next = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 1));
    const __m256i previous = LoadBeforeFirst<false, 1>(src);
    return BinomialSum16(previous, next);
}
// 256-bit variant: loads one vector at `offset` from each of buffer.src0..src3,
// combines them with BinomialSum16, applies DivideBy64 and masks the result with
// K16_00FF. The result stays in 16-bit lanes; no packing to bytes happens here.
// `align` is forwarded to Load to select the load variant.
template <bool align> SIMD_INLINE __m256i ReduceRow16(const Buffer & buffer, size_t offset)
{
    const __m256i row0 = Load<align>((__m256i*)(buffer.src0 + offset));
    const __m256i row1 = Load<align>((__m256i*)(buffer.src1 + offset));
    const __m256i row2 = Load<align>((__m256i*)(buffer.src2 + offset));
    const __m256i row3 = Load<align>((__m256i*)(buffer.src3 + offset));

    const __m256i averaged = DivideBy64(BinomialSum16(row0, row1, row2, row3));
    return _mm256_and_si256(averaged, K16_00FF);
}