template <bool align> void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) { assert(width > A); if(align) assert(Aligned(src) && Aligned(stride)); size_t bodyWidth = Simd::AlignHi(width, A) - A; const uint8_t *src0, *src1, *src2; v128_u8 a[3][3]; v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); *sum = 0; for(size_t row = 0; row < height; ++row) { src0 = src + stride*(row - 1); src1 = src0 + stride; src2 = src1 + stride; if(row == 0) src0 = src1; if(row == height - 1) src2 = src1; v128_u32 sums[2] = {K32_00000000, K32_00000000}; LoadNose3<align, 1>(src0 + 0, a[0]); LoadNose3<align, 1>(src1 + 0, a[1]); LoadNose3<align, 1>(src2 + 0, a[2]); LaplaceAbsSum(a, sums); for(size_t col = A; col < bodyWidth; col += A) { LoadBody3<align, 1>(src0 + col, a[0]); LoadBody3<align, 1>(src1 + col, a[1]); LoadBody3<align, 1>(src2 + col, a[2]); LaplaceAbsSum(a, sums); } LoadTail3<false, 1>(src0 + width - A, a[0]); LoadTail3<false, 1>(src1 + width - A, a[1]); LoadTail3<false, 1>(src2 + width - A, a[2]); SetMask3x3(a, tailMask); LaplaceAbsSum(a, sums); *sum += ExtractSum(vec_add(sums[0], sums[1])); } }
void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) { assert(width > A); size_t bodyWidth = Simd::AlignHi(width, A) - A; const uint8_t *src0, *src1, *src2; v16u8 a[3][3]; v2u64 fullSum = Zero<v2u64>(); const v16u8 K8_FF = Fill((uint8_t)0xff); v16u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); for (size_t row = 0; row < height; ++row) { src0 = src + stride*(row - 1); src1 = src0 + stride; src2 = src1 + stride; if (row == 0) src0 = src1; if (row == height - 1) src2 = src1; v4u32 rowSum = Zero<v4u32>(); LoadNoseDx(src0 + 0, a[0]); LoadNoseDx(src1 + 0, a[1]); LoadNoseDx(src2 + 0, a[2]); SobelDxAbsSum(a, rowSum); for (size_t col = A; col < bodyWidth; col += A) { LoadBodyDx(src0 + col, a[0]); LoadBodyDx(src1 + col, a[1]); LoadBodyDx(src2 + col, a[2]); SobelDxAbsSum(a, rowSum); } LoadTailDx(src0 + width - A, a[0]); LoadTailDx(src1 + width - A, a[1]); LoadTailDx(src2 + width - A, a[2]); SetMask3x3(a, tailMask); SobelDxAbsSum(a, rowSum); fullSum = PadSum(fullSum,rowSum); } *sum = ExtractSum(fullSum); }