void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
        {
            src += srcStride + 1;
            mask += maskStride + 1;
            width -= 2;
            height -= 2;

            *sum = 0;
            for (size_t row = 0; row < height; ++row)
            {
                uint32_t rowSum = 0;
                for (size_t col = 0; col < width; ++col)
                {
                    if (Compare8u<compareType>(mask[col], value))
                    {
                        rowSum += SquaredDifference(src[col + 1], src[col - 1]);
                        rowSum += SquaredDifference(src[col + srcStride], src[col - srcStride]);
                    }
                }
                *sum += rowSum;
                src += srcStride;
                mask += maskStride;
            }
        }
		template <bool align> void SquaredDifferenceSum(
			const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);
			if(align)
			{
				assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			__m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth);
			__m128i fullSum = _mm_setzero_si128();
			for(size_t row = 0; row < height; ++row)
			{
				__m128i rowSum = _mm_setzero_si128();
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					const __m128i a_ = Load<align>((__m128i*)(a + col));
					const __m128i b_ = Load<align>((__m128i*)(b + col)); 
					rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				if(width - bodyWidth)
				{
					const __m128i a_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(a + width - A)));
					const __m128i b_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(b + width - A))); 
					rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
				a += aStride;
				b += bStride;
			}
			*sum = ExtractInt64Sum(fullSum);
		}
		template <bool align> void SquaredDifferenceSumMasked(
			const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);
			if(align)
			{
				assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
				assert(Aligned(mask) && Aligned(maskStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			__m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
			__m256i fullSum = _mm256_setzero_si256();
			__m256i index_= _mm256_set1_epi8(index);
			for(size_t row = 0; row < height; ++row)
			{
				__m256i rowSum = _mm256_setzero_si256();
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					const __m256i mask_ = LoadMaskI8<align>((__m256i*)(mask + col), index_);
					const __m256i a_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(a + col)));
					const __m256i b_ = _mm256_and_si256(mask_, Load<align>((__m256i*)(b + col))); 
					rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				if(width - bodyWidth)
				{
					const __m256i mask_ = _mm256_and_si256(tailMask, LoadMaskI8<false>((__m256i*)(mask + width - A), index_));
					const __m256i a_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(a + width - A)));
					const __m256i b_ = _mm256_and_si256(mask_, Load<false>((__m256i*)(b + width - A))); 
					rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
				a += aStride;
				b += bStride;
				mask += maskStride;
			}
			*sum = ExtractSum<uint64_t>(fullSum);
		}
		void SquaredDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);

			*sum = 0;
			for(size_t row = 0; row < height; ++row)
			{
				int rowSum = 0;
				for(size_t col = 0; col < width; ++col)
				{
					rowSum += SquaredDifference(a[col], b[col]);
				}
				*sum += rowSum;
				a += aStride;
				b += bStride;
			}
		}
		void SquaredDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);

			*sum = 0;
			for(size_t row = 0; row < height; ++row)
			{
				int rowSum = 0;
				for(size_t col = 0; col < width; ++col)
				{
					if(mask[col] == index)
						rowSum += SquaredDifference(a[col], b[col]);
				}
				*sum += rowSum;
				a += aStride;
				b += bStride;
				mask += maskStride;
			}
		}