// Accumulates, over the interior pixels of an image (the one-pixel border is
// skipped), the squared horizontal and vertical central differences of src,
// counting only pixels whose mask byte satisfies
// Compare8u<compareType>(mask, value).
//
//   src, srcStride   - image bytes and the distance in bytes between rows.
//   width, height    - image size in pixels, border included.
//   mask, maskStride - selection mask laid out like src.
//   value            - right-hand operand of the mask comparison.
//   sum              - receives the total; always written.
//
// NOTE(review): compareType is not declared in this block; it is presumably a
// template parameter introduced by a template header outside the visible
// text - confirm.
void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
    const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
{
    *sum = 0;

    // An image narrower or shorter than 3 pixels has no interior. Returning
    // here also prevents the unsigned "width -= 2" / "height -= 2" below from
    // wrapping around to a huge loop bound when a dimension is 0 or 1.
    if (width < 3 || height < 3)
        return;

    // Step to the first interior pixel so that the left/right and upper/lower
    // neighbours addressed below all lie inside the image.
    src += srcStride + 1;
    mask += maskStride + 1;
    width -= 2;
    height -= 2;

    for (size_t row = 0; row < height; ++row)
    {
        // 64-bit accumulator: the width is not bounded here, so a 32-bit row
        // sum could wrap on very wide rows.
        uint64_t rowSum = 0;
        for (size_t col = 0; col < width; ++col)
        {
            if (Compare8u<compareType>(mask[col], value))
            {
                rowSum += SquaredDifference(src[col + 1], src[col - 1]);
                rowSum += SquaredDifference(src[col + srcStride], src[col - srcStride]);
            }
        }
        *sum += rowSum;
        src += srcStride;
        mask += maskStride;
    }
}
template <bool align> void SquaredDifferenceSum( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if(align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); } size_t bodyWidth = AlignLo(width, A); __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth); __m128i fullSum = _mm_setzero_si128(); for(size_t row = 0; row < height; ++row) { __m128i rowSum = _mm_setzero_si128(); for(size_t col = 0; col < bodyWidth; col += A) { const __m128i a_ = Load<align>((__m128i*)(a + col)); const __m128i b_ = Load<align>((__m128i*)(b + col)); rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_)); } if(width - bodyWidth) { const __m128i a_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(a + width - A))); const __m128i b_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(b + width - A))); rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_)); } fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum)); a += aStride; b += bStride; } *sum = ExtractInt64Sum(fullSum); }
template <bool align> void SquaredDifferenceSumMasked( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if(align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); assert(Aligned(mask) && Aligned(maskStride)); } size_t bodyWidth = AlignLo(width, A); __m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF); __m256i fullSum = _mm256_setzero_si256(); __m256i index_= _mm256_set1_epi8(index); for(size_t row = 0; row < height; ++row) { __m256i rowSum = _mm256_setzero_si256(); for(size_t col = 0; col < bodyWidth; col += A) { const __m256i mask_ = LoadMaskI8<align>((__m256i*)(mask + col), index_); const __m256i a_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(a + col))); const __m256i b_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(b + col))); rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_)); } if(width - bodyWidth) { const __m256i mask_ = _mm256_and_si256(tailMask, LoadMaskI8<false>((__m256i*)(mask + width - A), index_)); const __m256i a_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(a + width - A))); const __m256i b_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(b + width - A))); rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_)); } fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum)); a += aStride; b += bStride; mask += maskStride; } *sum = ExtractSum<uint64_t>(fullSum); }
// Scalar reference version: adds up SquaredDifference(a[col], b[col]) over a
// width x height block of bytes and stores the 64-bit total in *sum.
//
//   a, aStride       - first image and its row stride in bytes.
//   b, bStride       - second image and its row stride in bytes.
//   width, height    - block size; width must be below 0x10000 (asserted).
//   sum              - receives the result; always written.
void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000);

    *sum = 0;
    for(size_t row = 0; row < height; ++row)
    {
        // Unsigned 32-bit row accumulator. Assuming SquaredDifference yields
        // (a - b)^2 for byte inputs, one term is at most 255 * 255 = 65025, so
        // a row of up to 65535 pixels can reach 4,261,413,375. That is above
        // INT_MAX (a signed int would overflow, which is undefined behaviour,
        // and a negative row sum would corrupt *sum) but still below 2^32.
        uint32_t rowSum = 0;
        for(size_t col = 0; col < width; ++col)
        {
            rowSum += static_cast<uint32_t>(SquaredDifference(a[col], b[col]));
        }
        *sum += rowSum;
        a += aStride;
        b += bStride;
    }
}
// Scalar reference version: adds up SquaredDifference(a[col], b[col]) over a
// width x height block of bytes, counting only the positions where
// mask[col] == index, and stores the 64-bit total in *sum.
//
//   a, aStride         - first image and its row stride in bytes.
//   b, bStride         - second image and its row stride in bytes.
//   mask, maskStride   - selection mask and its row stride in bytes.
//   index              - mask value that selects a pixel.
//   width, height      - block size; width must be below 0x10000 (asserted).
//   sum                - receives the result; always written.
void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
    const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
{
    assert(width < 0x10000);

    *sum = 0;
    for(size_t row = 0; row < height; ++row)
    {
        // Unsigned 32-bit row accumulator. Assuming SquaredDifference yields
        // (a - b)^2 for byte inputs, one term is at most 255 * 255 = 65025, so
        // a fully selected row of up to 65535 pixels can reach 4,261,413,375.
        // That is above INT_MAX (a signed int would overflow, which is
        // undefined behaviour, and a negative row sum would corrupt *sum) but
        // still below 2^32.
        uint32_t rowSum = 0;
        for(size_t col = 0; col < width; ++col)
        {
            if(mask[col] == index)
                rowSum += static_cast<uint32_t>(SquaredDifference(a[col], b[col]));
        }
        *sum += rowSum;
        a += aStride;
        b += bStride;
        mask += maskStride;
    }
}