void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) { assert(width >= HA); if (align) assert(Aligned(src) && Aligned(stride)); size_t alignedWidth = AlignLo(width, DA); size_t bodyWidth = Simd::AlignLo(width, HA); v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth); v128_s16 _value = SIMD_VEC_SET1_EPI16(value); v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (size_t row = 0; row < height; ++row) { const int16_t * s = (const int16_t *)src; size_t col = 0; for (; col < alignedWidth; col += DA) { ConditionalCount16i<align, compareType>(s, col, _value, counts[0]); ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]); ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]); ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]); } for (; col < bodyWidth; col += HA) ConditionalCount16i<align, compareType>(s, col, _value, counts[0]); if (alignedWidth != width) { const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask); counts[0] = vec_msum(mask, K16_0001, counts[0]); } src += stride; } counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3])); *count = ExtractSum(counts[0]); }
void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum) { Buffer buffer(count); size_t alignedCount = AlignLo(count, 4); for(size_t j = 0; j < length; ++j) { size_t i = 0; float v = x[j]; __m128 _v = _mm_set1_ps(v); for(; i < alignedCount; i += 4) { __m128 sums = Load<true>(buffer.sums + i); __m128 _svs = Load<false>(svs + i); Store<true>(buffer.sums + i, _mm_add_ps(sums, _mm_mul_ps(_v, _svs))); } for(; i < count; ++i) buffer.sums[i] += v*svs[i]; svs += count; } size_t i = 0; __m128 _sum = _mm_setzero_ps(); for(; i < alignedCount; i += 4) { __m128 sums = Load<true>(buffer.sums + i); __m128 _weights = Load<false>(weights + i); _sum = _mm_add_ps(_sum, _mm_mul_ps(sums, _weights)); } *sum = ExtractSum(_sum); for(; i < count; ++i) *sum += buffer.sums[i]*weights[i]; }
void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(stride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth); v128_u8 _value = SIMD_VEC_SET1_EPI8(value); v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (size_t row = 0; row < height; ++row) { size_t col = 0; for (; col < alignedWidth; col += QA) { ConditionalCount8u<align, compareType>(src, col, _value, counts[0]); ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]); ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]); ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]); } for (; col < bodyWidth; col += A) ConditionalCount8u<align, compareType>(src, col, _value, counts[0]); if (alignedWidth != width) { const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask); counts[0] = vec_msum(mask, K8_01, counts[0]); } src += stride; } counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3])); *count = ExtractSum(counts[0]); }
template <bool align> void AbsDifferenceSum( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) { assert(width >= A); if (align) assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { AbsDifferenceSum<align>(a, b, col, sums[0]); AbsDifferenceSum<align>(a, b, col + A, sums[1]); AbsDifferenceSum<align>(a, b, col + 2 * A, sums[2]); AbsDifferenceSum<align>(a, b, col + 3 * A, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) AbsDifferenceSum<align>(a, b, col, sums[0]); if (width - bodyWidth) AbsDifferenceSumMasked<false>(a, b, width - A, tailMask, sums[0]); *sum += ExtractSum(sums[0]); a += aStride; b += bStride; } }
void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) { assert(width >= A + 2 && height >= 3); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); src += srcStride; mask += maskStride; height -= 2; size_t bodyWidth = Simd::AlignLo(width - 1, A); v128_u8 noseMask = ShiftRight(K8_FF, 1); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + 1 + bodyWidth); size_t alignedWidth = Simd::AlignLo(bodyWidth - A, DA); v128_u8 _value = SetU8(value); *sum = 0; for (size_t row = 0; row < height; ++row) { v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; { const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + 1), _value), noseMask); AddSquareDifference<false>(src + 1, 1, _mask, sums[0]); AddSquareDifference<false>(src + 1, srcStride, _mask, sums[1]); } size_t col = A; for (; col < alignedWidth; col += DA) { ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums); ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col + A, _value, sums + 2); } for (; col < bodyWidth; col += A) ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums); if (bodyWidth != width - 1) { size_t offset = width - A - 1; const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + offset), _value), tailMask); AddSquareDifference<false>(src + offset, 1, _mask, sums[0]); AddSquareDifference<false>(src + offset, srcStride, _mask, sums[1]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); *sum += ExtractSum(sums[0]); src += srcStride; mask += maskStride; } }
template <bool align> void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) { assert(height > 2 && width >= A + 2); if (align) assert(Aligned(background) && Aligned(backgroundStride)); width -= 2; height -= 2; current += 1 + currentStride; background += 1 + backgroundStride; mask += 1 + maskStride; size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); v128_u8 _index = SetU8(index); for (size_t i = 0; i < 9; ++i) sums[i] = 0; for (size_t row = 0; row < height; ++row) { v128_u32 _sums[9]; for (size_t i = 0; i < 9; ++i) _sums[i] = K32_00000000; for (size_t col = 0; col < bodyWidth; col += A) { const v128_u8 _mask = LoadMaskU8<false>(mask + col, _index); const v128_u8 _current = vec_and(Load<false>(current + col), _mask); AbsDifferenceSums3x3Masked<align>(_current, background + col, backgroundStride, _mask, _sums); } if (width - bodyWidth) { const v128_u8 _mask = vec_and(LoadMaskU8<false>(mask + width - A, _index), tailMask); const v128_u8 _current = vec_and(Load<false>(current + width - A), _mask); AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, _mask, _sums); } for (size_t i = 0; i < 9; ++i) sums[i] += ExtractSum(_sums[i]); current += currentStride; background += backgroundStride; mask += maskStride; } }
template <bool align> void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) { assert(width > A); if(align) assert(Aligned(src) && Aligned(stride)); size_t bodyWidth = Simd::AlignHi(width, A) - A; const uint8_t *src0, *src1, *src2; v128_u8 a[3][3]; v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); *sum = 0; for(size_t row = 0; row < height; ++row) { src0 = src + stride*(row - 1); src1 = src0 + stride; src2 = src1 + stride; if(row == 0) src0 = src1; if(row == height - 1) src2 = src1; v128_u32 sums[2] = {K32_00000000, K32_00000000}; LoadNose3<align, 1>(src0 + 0, a[0]); LoadNose3<align, 1>(src1 + 0, a[1]); LoadNose3<align, 1>(src2 + 0, a[2]); LaplaceAbsSum(a, sums); for(size_t col = A; col < bodyWidth; col += A) { LoadBody3<align, 1>(src0 + col, a[0]); LoadBody3<align, 1>(src1 + col, a[1]); LoadBody3<align, 1>(src2 + col, a[2]); LaplaceAbsSum(a, sums); } LoadTail3<false, 1>(src0 + width - A, a[0]); LoadTail3<false, 1>(src1 + width - A, a[1]); LoadTail3<false, 1>(src2 + width - A, a[2]); SetMask3x3(a, tailMask); LaplaceAbsSum(a, sums); *sum += ExtractSum(vec_add(sums[0], sums[1])); } }
template <bool align> void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride, const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums) { assert(height > 2 && width >= A + 2); if (align) assert(Aligned(background) && Aligned(backgroundStride)); width -= 2; height -= 2; current += 1 + currentStride; background += 1 + backgroundStride; size_t alignedWidth = AlignLo(width, DA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); memset(sums, 0, 9 * sizeof(uint64_t)); for (size_t row = 0; row < height; ++row) { v128_u32 _sums[2][9]; memset(_sums, 0, 18 * sizeof(v128_u32)); size_t col = 0; for (; col < alignedWidth; col += DA) { AbsDifferenceSums3x3<align>(Load<false>(current + col), background + col, backgroundStride, _sums[0]); AbsDifferenceSums3x3<align>(Load<false>(current + col + A), background + col + A, backgroundStride, _sums[0]); } for (; col < bodyWidth; col += A) AbsDifferenceSums3x3<align>(Load<false>(current + col), background + col, backgroundStride, _sums[0]); if (width - bodyWidth) { const v128_u8 _current = vec_and(tailMask, Load<false>(current + width - A)); AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, tailMask, _sums[0]); } for (size_t i = 0; i < 9; ++i) sums[i] += ExtractSum(vec_add(_sums[0][i], _sums[1][i])); current += currentStride; background += backgroundStride; } }
void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum) { assert(width > A); size_t bodyWidth = Simd::AlignHi(width, A) - A; const uint8_t *src0, *src1, *src2; v16u8 a[3][3]; v2u64 fullSum = Zero<v2u64>(); const v16u8 K8_FF = Fill((uint8_t)0xff); v16u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); for (size_t row = 0; row < height; ++row) { src0 = src + stride*(row - 1); src1 = src0 + stride; src2 = src1 + stride; if (row == 0) src0 = src1; if (row == height - 1) src2 = src1; v4u32 rowSum = Zero<v4u32>(); LoadNoseDx(src0 + 0, a[0]); LoadNoseDx(src1 + 0, a[1]); LoadNoseDx(src2 + 0, a[2]); SobelDxAbsSum(a, rowSum); for (size_t col = A; col < bodyWidth; col += A) { LoadBodyDx(src0 + col, a[0]); LoadBodyDx(src1 + col, a[1]); LoadBodyDx(src2 + col, a[2]); SobelDxAbsSum(a, rowSum); } LoadTailDx(src0 + width - A, a[0]); LoadTailDx(src1 + width - A, a[1]); LoadTailDx(src2 + width - A, a[2]); SetMask3x3(a, tailMask); SobelDxAbsSum(a, rowSum); fullSum = PadSum(fullSum,rowSum); } *sum = ExtractSum(fullSum); }
void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); v128_u8 _value = SetU8(value); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]); ConditionalSquareSum<align, compareType>(src, mask, col + A, _value, sums[1]); ConditionalSquareSum<align, compareType>(src, mask, col + 2 * A, _value, sums[2]); ConditionalSquareSum<align, compareType>(src, mask, col + 3 * A, _value, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]); if (alignedWidth != width) { const v128_u8 _mask = Compare8u<compareType>(Load<false>(mask + width - A), _value); const v128_u8 _src = vec_and(vec_and(Load<false>(src + width - A), _mask), tailMask); sums[0] = vec_msum(_src, _src, sums[0]); } *sum += ExtractSum(sums[0]); src += srcStride; mask += maskStride; } }