// Combines three 16-byte registers with BinomialSum16 and writes the 16-bit results to b.
//   a - three input registers; their bytes are interleaved with K_ZERO before summing.
//   b - destination; the low-half result goes to b, the high-half result to b + HA.
// `align` is forwarded to Store<> for both writes.
// NOTE(review): K_ZERO is presumably an all-zero register, which would make the unpack a
// zero-extension of each byte to 16 bits - confirm against its definition.
template<bool align> SIMD_INLINE void BlurCol(__m128i a[3], uint16_t * b)
 {
     // Low eight bytes of every input register.
     const __m128i lo0 = _mm_unpacklo_epi8(a[0], K_ZERO);
     const __m128i lo1 = _mm_unpacklo_epi8(a[1], K_ZERO);
     const __m128i lo2 = _mm_unpacklo_epi8(a[2], K_ZERO);
     Store<align>((__m128i*)b, BinomialSum16(lo0, lo1, lo2));

     // High eight bytes of every input register (read only after the first store, as before).
     const __m128i hi0 = _mm_unpackhi_epi8(a[0], K_ZERO);
     const __m128i hi1 = _mm_unpackhi_epi8(a[1], K_ZERO);
     const __m128i hi2 = _mm_unpackhi_epi8(a[2], K_ZERO);
     Store<align>((__m128i*)(b + HA), BinomialSum16(hi0, hi1, hi2));
 }
 // Loads one register from each of buffer.src0/src1/src2 at `offset`, combines them with
 // BinomialSum16 and returns the result passed through DivideBy16.
 // `align` is forwarded to every Load<>.
 template<bool align> SIMD_INLINE __m128i BlurRow16(const Buffer & buffer, size_t offset)
 {
     const __m128i row0 = Load<align>((__m128i*)(buffer.src0 + offset));
     const __m128i row1 = Load<align>((__m128i*)(buffer.src1 + offset));
     const __m128i row2 = Load<align>((__m128i*)(buffer.src2 + offset));
     return DivideBy16(BinomialSum16(row0, row1, row2));
 }
 // Unaligned (align == false) specialization for the 256-bit path.
 // Reads 32 bytes starting one byte before `src`, obtains two further registers from
 // LoadAfterLast<false, 1> for the same start address, and returns BinomialSum16 of the
 // plain load and the second of those registers.
 // NOTE(review): the first LoadAfterLast output is not used here; it exists only because
 // the helper fills two out-parameters.
 template <> SIMD_INLINE __m256i ReduceColTail<false>(const uint8_t * src)
 {
     const uint8_t * const start = src - 1;
     const __m256i first = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(start));
     __m256i afterOne, afterTwo;
     LoadAfterLast<false, 1>(start, afterOne, afterTwo);
     return BinomialSum16(first, afterTwo);
 }
 // Builds the four BinomialSum16 operands from two unaligned 16-byte loads at src - 1 and
 // src + 1. Each load contributes itself and a copy shifted right by one byte; every
 // operand is masked with K16_00FF before summing.
 // NOTE(review): K16_00FF presumably keeps the low byte of each 16-bit lane - confirm.
 SIMD_INLINE __m128i ReduceColBody(const uint8_t *src)
 {
     const __m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
     const __m128i next = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));

     const __m128i s0 = _mm_and_si128(prev, K16_00FF);
     const __m128i s1 = _mm_and_si128(_mm_srli_si128(prev, 1), K16_00FF);
     const __m128i s2 = _mm_and_si128(next, K16_00FF);
     const __m128i s3 = _mm_and_si128(_mm_srli_si128(next, 1), K16_00FF);

     return BinomialSum16(s0, s1, s2, s3);
 }
 // Variant that never reads before `src`: the first operand comes from
 // LoadBeforeFirst<1> applied to the register loaded at src, rather than from a load at
 // src - 1. The other operands are the load at src, the load at src + 1, and that second
 // load shifted right by one byte. All four are masked with K16_00FF.
 SIMD_INLINE __m128i ReduceColNose(const uint8_t *src)
 {
     const __m128i cur = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
     const __m128i next = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));

     const __m128i s0 = _mm_and_si128(LoadBeforeFirst<1>(cur), K16_00FF);
     const __m128i s1 = _mm_and_si128(cur, K16_00FF);
     const __m128i s2 = _mm_and_si128(next, K16_00FF);
     const __m128i s3 = _mm_and_si128(_mm_srli_si128(next, 1), K16_00FF);

     return BinomialSum16(s0, s1, s2, s3);
 }
 // Unaligned (align == false) specialization for the 128-bit, four-operand path.
 // Performs a single memory load at src - 1; the following registers are derived from it
 // by applying LoadAfterLast<1> once and then a second time. The four operands are the
 // load, the two derived registers and the second derived register shifted right by one
 // byte, each masked with K16_00FF.
 template <> SIMD_INLINE __m128i ReduceColTail<false>(const uint8_t *src)
 {
     const __m128i base = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
     const __m128i stepOne = LoadAfterLast<1>(base);
     const __m128i stepTwo = LoadAfterLast<1>(stepOne);

     const __m128i s0 = _mm_and_si128(base, K16_00FF);
     const __m128i s1 = _mm_and_si128(stepOne, K16_00FF);
     const __m128i s2 = _mm_and_si128(stepTwo, K16_00FF);
     const __m128i s3 = _mm_and_si128(_mm_srli_si128(stepTwo, 1), K16_00FF);

     return BinomialSum16(s0, s1, s2, s3);
 }
 // Loads one register from each of buffer.src0..src3 at `offset`, combines them with
 // BinomialSum16, applies DivideBy64, masks with K16_00FF and packs the 16-bit lanes to
 // bytes. The packed bytes occupy the low half of the result; the high half is packed
 // from K_ZERO. `align` is forwarded to every Load<>.
 template <bool align> SIMD_INLINE __m128i ReduceRow(const Buffer & buffer, size_t offset)
 {
     const __m128i row0 = Load<align>((__m128i*)(buffer.src0 + offset));
     const __m128i row1 = Load<align>((__m128i*)(buffer.src1 + offset));
     const __m128i row2 = Load<align>((__m128i*)(buffer.src2 + offset));
     const __m128i row3 = Load<align>((__m128i*)(buffer.src3 + offset));

     const __m128i scaled = DivideBy64(BinomialSum16(row0, row1, row2, row3));
     return _mm_packus_epi16(_mm_and_si128(scaled, K16_00FF), K_ZERO);
 }
Example #8
0
		// NEON variant: loads from buffer.src0..src3 at `offset`, combines the four values
		// with BinomialSum16, applies DivideBy64 and narrows each 16-bit lane to 8 bits
		// with vmovn_u16. `align` is forwarded to every Load<>.
		template <bool align> SIMD_INLINE uint8x8_t ReduceRow(const Buffer & buffer, size_t offset)
		{
			const uint16x8_t scaled = DivideBy64(BinomialSum16(
				Load<align>(buffer.src0 + offset),
				Load<align>(buffer.src1 + offset),
				Load<align>(buffer.src2 + offset),
				Load<align>(buffer.src3 + offset)));
			return vmovn_u16(scaled);
		}
 // Unaligned (align == false) specialization for the 128-bit, two-operand path.
 // Loads 16 bytes at src - 1, applies LoadAfterLast<1> to that register twice, and
 // returns BinomialSum16 of the original load and the twice-derived register.
 template <> SIMD_INLINE __m128i ReduceColTail<false>(const uint8_t *src)
 {
     const __m128i base = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
     const __m128i stepOne = LoadAfterLast<1>(base);
     const __m128i stepTwo = LoadAfterLast<1>(stepOne);
     return BinomialSum16(base, stepTwo);
 }
 // Two-operand 128-bit variant: returns BinomialSum16 of the unaligned 16-byte loads
 // taken one byte before and one byte after `src`.
 SIMD_INLINE __m128i ReduceColBody(const uint8_t *src)
 {
     const __m128i prev = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src - 1));
     const __m128i next = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));
     return BinomialSum16(prev, next);
 }
 // Two-operand 128-bit variant that never reads before `src`: the first operand is
 // LoadBeforeFirst<1> applied to the register loaded at src, the second is the unaligned
 // load at src + 1.
 SIMD_INLINE __m128i ReduceColNose(const uint8_t *src)
 {
     const __m128i cur = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
     const __m128i next = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + 1));
     const __m128i prev = LoadBeforeFirst<1>(cur);
     return BinomialSum16(prev, next);
 }
 // NEON variant: combines three 16-bit row registers with BinomialSum16, applies
 // DivideBy16<compensation>, and narrows each 16-bit lane to 8 bits with vmovn_u16.
 // `compensation` is forwarded unchanged to DivideBy16<>.
 template <bool compensation> SIMD_INLINE uint8x8_t ReduceRow(const uint16x8_t & r0, const uint16x8_t & r1, const uint16x8_t & r2)
 {
     const uint16x8_t scaled = DivideBy16<compensation>(BinomialSum16(r0, r1, r2));
     return vmovn_u16(scaled);
 }
 // 256-bit variant: returns BinomialSum16 of the unaligned 32-byte loads taken one byte
 // before and one byte after `src`.
 SIMD_INLINE __m256i ReduceColBody(const uint8_t * src)
 {
     const __m256i prev = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src - 1));
     const __m256i next = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 1));
     return BinomialSum16(prev, next);
 }
 // 256-bit variant with no direct load from src - 1: the first operand comes from
 // LoadBeforeFirst<false, 1>(src), the second is the unaligned 32-byte load at src + 1.
 SIMD_INLINE __m256i ReduceColNose(const uint8_t * src)
 {
     const __m256i next = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 1));
     const __m256i prev = LoadBeforeFirst<false, 1>(src);
     return BinomialSum16(prev, next);
 }
 // 256-bit variant: loads one register from each of buffer.src0..src3 at `offset`,
 // combines them with BinomialSum16, applies DivideBy64 and masks the result with
 // K16_00FF. No packing to bytes happens here; the result is returned as-is.
 // `align` is forwarded to every Load<>.
 template <bool align> SIMD_INLINE __m256i ReduceRow16(const Buffer & buffer, size_t offset)
 {
     const __m256i row0 = Load<align>((__m256i*)(buffer.src0 + offset));
     const __m256i row1 = Load<align>((__m256i*)(buffer.src1 + offset));
     const __m256i row2 = Load<align>((__m256i*)(buffer.src2 + offset));
     const __m256i row3 = Load<align>((__m256i*)(buffer.src3 + offset));

     const __m256i scaled = DivideBy64(BinomialSum16(row0, row1, row2, row3));
     return _mm256_and_si256(scaled, K16_00FF);
 }