// Adds to `sum` the squares of the masked absolute differences between the
// vectors loaded at (src - step) and (src + step).
//   src  - address of the central element; neighbours are read at +/- step.
//   step - byte distance from `src` to each neighbour.
//   mask - AND-ed with the absolute difference before squaring.
//   sum  - accumulator, updated in place.
// NOTE(review): `align` is not declared inside this block; presumably it is a
// template parameter declared just above - confirm.
SIMD_INLINE void AddSquareDifference(const uint8_t * src, ptrdiff_t step, const v128_u8 & mask, v128_u32 & sum)
 {
     const v128_u8 before = Load<align>(src - step);
     const v128_u8 after = Load<align>(src + step);
     const v128_u8 absDiff = AbsDifferenceU8(before, after);
     const v128_u8 masked = vec_and(absDiff, mask);
     // Multiply-sum of the vector with itself accumulates the squares.
     sum = vec_msum(masked, masked, sum);
 }
 // Returns |avg(src[-step], src[+step]) - src[0]| for one 256-bit vector of
 // unsigned bytes.
 //   src  - address of the central vector.
 //   step - byte distance from `src` to each of the two neighbours.
 // The neighbour loads are treated as aligned only when both the source and
 // the step are aligned.
 // NOTE(review): `srcAlign` and `stepAlign` are not declared inside this block;
 // presumably they are template parameters declared just above - confirm.
 SIMD_INLINE __m256i AbsSecondDerivative(const uint8_t * src, ptrdiff_t step)
 {
     const __m256i prev = Load<srcAlign && stepAlign>((__m256i*)(src - step));
     const __m256i curr = Load<srcAlign>((__m256i*)src);
     const __m256i next = Load<srcAlign && stepAlign>((__m256i*)(src + step));
     const __m256i neighbourAvg = _mm256_avg_epu8(prev, next);
     return AbsDifferenceU8(neighbourAvg, curr);
 }
		// Writes to `dst` the sum of the absolute horizontal and vertical
		// differences of `src` for every interior pixel, limited to 0xFF.
		// The first and last rows and the first and last columns of `dst` are
		// set to zero.
		//   src, srcStride - source 8-bit image and its row pitch in bytes.
		//   width, height  - image size in pixels.
		//   dst, dstStride - destination 8-bit image and its row pitch in bytes.
		// Degenerate sizes are handled explicitly: an empty image writes nothing,
		// and a one-row image only zeroes that single row.
		void AbsGradientSaturatedSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
		{
			// With no pixels there is nothing to write; this also keeps the
			// unsigned expression `width - 1` below from wrapping around.
			if (width == 0 || height == 0)
				return;

			// Top border row.
			memset(dst, 0, width);

			// A single-row image has no separate bottom row to clear.
			if (height == 1)
				return;

			src += srcStride;
			dst += dstStride;
			for (size_t row = 2; row < height; ++row)
			{
				// Explicit neighbour-row pointers avoid indexing with a wrapped
				// unsigned offset (col - srcStride).
				const uint8_t * above = src - srcStride;
				const uint8_t * below = src + srcStride;

				dst[0] = 0;
				for (size_t col = 1; col + 1 < width; ++col)
				{
					const int dy = AbsDifferenceU8(above[col], below[col]);
					const int dx = AbsDifferenceU8(src[col - 1], src[col + 1]);
					dst[col] = MinU8(dx + dy, 0xFF);
				}
				dst[width - 1] = 0;

				src += srcStride;
				dst += dstStride;
			}

			// Bottom border row.
			memset(dst, 0, width);
		}
        // For every interior pixel whose mask value equals `index`, accumulates
        // the absolute difference between the `current` pixel and each of the
        // nine `background` pixels in its 3x3 neighbourhood.
        //   sums[0..2] - neighbours in the row above (left, centre, right),
        //   sums[3..5] - neighbours in the same row,
        //   sums[6..8] - neighbours in the row below.
        // The outermost one-pixel frame of the image is skipped, so the image
        // must be larger than 2x2 (asserted).
        void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride,
            const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums)
        {
            assert(width > 2 && height > 2);

            for (size_t k = 0; k < 9; ++k)
                sums[k] = 0;

            // Work on the interior only: drop one pixel on every side.
            const size_t innerHeight = height - 2;
            const size_t innerWidth = width - 2;
            const uint8_t * curRow = current + 1 + currentStride;
            const uint8_t * bgRow = background + 1 + backgroundStride;
            const uint8_t * maskRow = mask + 1 + maskStride;

            for (size_t y = 0; y < innerHeight; ++y)
            {
                // Per-row partial sums are kept in int and folded into the
                // 64-bit totals once per row.
                int partial[9] = { 0 };

                for (size_t x = 0; x < innerWidth; ++x)
                {
                    if (maskRow[x] != index)
                        continue;

                    const int value = curRow[x];
                    const uint8_t * mid = bgRow + x;
                    const uint8_t * top = mid - backgroundStride;
                    const uint8_t * bottom = mid + backgroundStride;

                    partial[0] += AbsDifferenceU8(value, top[-1]);
                    partial[1] += AbsDifferenceU8(value, top[0]);
                    partial[2] += AbsDifferenceU8(value, top[1]);
                    partial[3] += AbsDifferenceU8(value, mid[-1]);
                    partial[4] += AbsDifferenceU8(value, mid[0]);
                    partial[5] += AbsDifferenceU8(value, mid[1]);
                    partial[6] += AbsDifferenceU8(value, bottom[-1]);
                    partial[7] += AbsDifferenceU8(value, bottom[0]);
                    partial[8] += AbsDifferenceU8(value, bottom[1]);
                }

                for (size_t k = 0; k < 9; ++k)
                    sums[k] += partial[k];

                curRow += currentStride;
                bgRow += backgroundStride;
                maskRow += maskStride;
            }
        }
		// Stores in `*sum` the total of the absolute differences between the
		// corresponding pixels of two 8-bit images.
		//   a, aStride    - first image and its row pitch in bytes.
		//   b, bStride    - second image and its row pitch in bytes.
		//   width, height - image size in pixels.
		//   sum           - receives the result; reset to zero on entry.
		void AbsDifferenceSum(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			size_t width, size_t height, uint64_t * sum)
		{
			*sum = 0;

			const uint8_t * rowA = a;
			const uint8_t * rowB = b;
			for (size_t remaining = height; remaining > 0; --remaining)
			{
				// Each row is summed in an int and then added to the 64-bit total.
				int partial = 0;
				for (size_t x = 0; x != width; ++x)
					partial += AbsDifferenceU8(rowA[x], rowB[x]);
				*sum += partial;

				rowA += aStride;
				rowB += bStride;
			}
		}
		// Stores in `*sum` the total of the absolute differences between the
		// corresponding pixels of two 8-bit images, counting only the pixels
		// whose mask value equals `index`.
		//   a, aStride       - first image and its row pitch in bytes.
		//   b, bStride       - second image and its row pitch in bytes.
		//   mask, maskStride - 8-bit mask image and its row pitch in bytes.
		//   index            - mask value that selects a pixel.
		//   width, height    - image size in pixels.
		//   sum              - receives the result; reset to zero on entry.
		void AbsDifferenceSumMasked(const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
		{
			*sum = 0;

			const uint8_t * rowA = a;
			const uint8_t * rowB = b;
			const uint8_t * rowMask = mask;
			for (size_t remaining = height; remaining > 0; --remaining)
			{
				// Each row is summed in an int and then added to the 64-bit total.
				int partial = 0;
				for (size_t x = 0; x != width; ++x)
				{
					if (rowMask[x] != index)
						continue;
					partial += AbsDifferenceU8(rowA[x], rowB[x]);
				}
				*sum += partial;

				rowA += aStride;
				rowB += bStride;
				rowMask += maskStride;
			}
		}
 template <bool align> void AbsDifferenceSumMasked(const uint8_t * a, const uint8_t *b, size_t offset, const v128_u8 & mask, v128_u32 & sum)
 {
     const v128_u8 _a = vec_and(Load<align>(a + offset), mask);
     const v128_u8 _b = vec_and(Load<align>(b + offset), mask);
     sum = vec_msum(AbsDifferenceU8(_a, _b), K8_01, sum);
 }
 template <bool align> void AbsDifferenceSums3Masked(const v128_u8 & current, const uint8_t * background, const v128_u8 & mask, v128_u32 sums[3])
 {
     sums[0] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<align>(background - 1))), K8_01, sums[0]);
     sums[1] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<false>(background))), K8_01, sums[1]);
     sums[2] = vec_msum(AbsDifferenceU8(current, vec_and(mask, Load<false>(background + 1))), K8_01, sums[2]);
 }
 // Returns, for one vector of bytes, the saturating sum of the absolute
 // horizontal difference (src - 1 vs src + 1) and the absolute vertical
 // difference (src - stride vs src + stride).
 //   align  - applied to the two vertical loads; the horizontal loads always
 //            use Load<false>.
 //   stride - byte distance between vertically adjacent elements.
 template<bool align> SIMD_INLINE v128_u8 AbsGradientSaturatedSum(const uint8_t * src, size_t stride)
 {
     const v128_u8 left = Load<false>(src - 1);
     const v128_u8 right = Load<false>(src + 1);
     const v128_u8 horizontal = AbsDifferenceU8(left, right);

     const v128_u8 upper = Load<align>(src - stride);
     const v128_u8 lower = Load<align>(src + stride);
     const v128_u8 vertical = AbsDifferenceU8(upper, lower);

     return vec_adds(horizontal, vertical);
 }