template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) { assert(width >= HA); if(align) { assert(Aligned(blue) && Aligned(blueStride)); assert(Aligned(green) && Aligned(greenStride)); assert(Aligned(red) && Aligned(redStride)); assert(Aligned(bgra) && Aligned(bgraStride)); } v128_u8 _alpha = SetU8(alpha); size_t alignedWidth = AlignLo(width, HA); for(size_t row = 0; row < height; ++row) { Storer<align> _bgra(bgra); Bgr48pToBgra32<align, true>(blue, green, red, 0, _alpha, _bgra); for(size_t col = HA; col < alignedWidth; col += HA) Bgr48pToBgra32<align, false>(blue, green, red, col*2, _alpha, _bgra); Flush(_bgra); if(width != alignedWidth) { Storer<false> _bgra(bgra + (width - HA)*4); Bgr48pToBgra32<false, true>(blue, green, red, (width - HA)*2, _alpha, _bgra); Flush(_bgra); } blue += blueStride; green += greenStride; red += redStride; bgra += bgraStride; } }
template <bool align, bool increment> void InterferenceChangeMasked(int16_t * statistic, size_t statisticStride, size_t width, size_t height, uint8_t value, int16_t saturation, const uint8_t * mask, size_t maskStride, uint8_t index) { assert(width >= A); if(align) assert(Aligned(statistic) && Aligned(statisticStride, HA) && Aligned(mask) && Aligned(maskStride)); size_t alignedWidth = Simd::AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); v128_s16 _value = SetI16(value); v128_s16 _saturation = SetI16(saturation); v128_u8 _index = SetU8(index); for(size_t row = 0; row < height; ++row) { Loader<align> statisticSrc(statistic), maskSrc(mask); Storer<align> statisticDst(statistic); InterferenceChangeMasked<align, true, increment>(statisticSrc, _value, _saturation, maskSrc, _index, K8_FF, statisticDst); for(size_t col = A; col < alignedWidth; col += A) InterferenceChangeMasked<align, false, increment>(statisticSrc, _value, _saturation, maskSrc, _index, K8_FF, statisticDst); Flush(statisticDst); if(alignedWidth != width) { Loader<false> statisticSrc(statistic + width - A), maskSrc(mask + width - A); Storer<false> statisticDst(statistic + width - A); InterferenceChangeMasked<false, true, increment>(statisticSrc, _value, _saturation, maskSrc, _index, tailMask, statisticDst); Flush(statisticDst); } statistic += statisticStride; mask += maskStride; } }
template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) { assert(width >= A); if(align) assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); size_t alignedWidth = AlignLo(width, A); if(width == alignedWidth) alignedWidth -= A; const v128_u8 _alpha = SetU8(alpha); for(size_t row = 0; row < height; ++row) { Loader<align> _bgr(bgr); Storer<align> _bgra(bgra); BgrToBgra<align, true>(_bgr, _alpha, _bgra); for(size_t col = A; col < alignedWidth; col += A) BgrToBgra<align, false>(_bgr, _alpha, _bgra); Flush(_bgra); if(width != alignedWidth) { Loader<false> _bgr(bgr + 3*(width - A)); Storer<false> _bgra(bgra + 4*(width - A)); BgrToBgra<false, true>(_bgr, _alpha, _bgra); Flush(_bgra); } bgra += bgraStride; bgr += bgrStride; } }
void ConditionalFill(const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t threshold, uint8_t value, uint8_t * dst, size_t dstStride) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); size_t alignedWidth = Simd::AlignLo(width, A); v128_u8 _value = SetU8(value); v128_u8 _threshold = SetU8(threshold); for (size_t row = 0; row < height; ++row) { ConditionalFill<compareType, align>(src, 0, _threshold, _value, dst); for (size_t col = A; col < alignedWidth; col += A) ConditionalFill<compareType, true>(src, col, _threshold, _value, dst); if (!align) ConditionalFill<compareType, false>(src, alignedWidth - A, _threshold, _value, dst); if (alignedWidth != width) ConditionalFill<compareType, false>(src, width - A, _threshold, _value, dst); src += srcStride; dst += dstStride; } }
void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) { assert(width >= A + 2 && height >= 3); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); src += srcStride; mask += maskStride; height -= 2; size_t bodyWidth = Simd::AlignLo(width - 1, A); v128_u8 noseMask = ShiftRight(K8_FF, 1); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + 1 + bodyWidth); size_t alignedWidth = Simd::AlignLo(bodyWidth - A, DA); v128_u8 _value = SetU8(value); *sum = 0; for (size_t row = 0; row < height; ++row) { v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; { const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + 1), _value), noseMask); AddSquareDifference<false>(src + 1, 1, _mask, sums[0]); AddSquareDifference<false>(src + 1, srcStride, _mask, sums[1]); } size_t col = A; for (; col < alignedWidth; col += DA) { ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums); ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col + A, _value, sums + 2); } for (; col < bodyWidth; col += A) ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums); if (bodyWidth != width - 1) { size_t offset = width - A - 1; const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + offset), _value), tailMask); AddSquareDifference<false>(src + offset, 1, _mask, sums[0]); AddSquareDifference<false>(src + offset, srcStride, _mask, sums[1]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); *sum += ExtractSum(sums[0]); src += srcStride; mask += maskStride; } }
template <bool align> void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums) { assert(height > 2 && width >= A + 2); if (align) assert(Aligned(background) && Aligned(backgroundStride)); width -= 2; height -= 2; current += 1 + currentStride; background += 1 + backgroundStride; mask += 1 + maskStride; size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); v128_u8 _index = SetU8(index); for (size_t i = 0; i < 9; ++i) sums[i] = 0; for (size_t row = 0; row < height; ++row) { v128_u32 _sums[9]; for (size_t i = 0; i < 9; ++i) _sums[i] = K32_00000000; for (size_t col = 0; col < bodyWidth; col += A) { const v128_u8 _mask = LoadMaskU8<false>(mask + col, _index); const v128_u8 _current = vec_and(Load<false>(current + col), _mask); AbsDifferenceSums3x3Masked<align>(_current, background + col, backgroundStride, _mask, _sums); } if (width - bodyWidth) { const v128_u8 _mask = vec_and(LoadMaskU8<false>(mask + width - A, _index), tailMask); const v128_u8 _current = vec_and(Load<false>(current + width - A), _mask); AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, _mask, _sums); } for (size_t i = 0; i < 9; ++i) sums[i] += ExtractSum(_sums[i]); current += currentStride; background += backgroundStride; mask += maskStride; } }
template <bool align> void AbsDifferenceSumMasked( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) { assert(width >= A); if (align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); assert(Aligned(mask) && Aligned(maskStride)); } size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); v128_u8 _index = SetU8(index); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { AbsDifferenceSumMasked<align>(a, b, mask, col, _index, sums[0]); AbsDifferenceSumMasked<align>(a, b, mask, col + A, _index, sums[1]); AbsDifferenceSumMasked<align>(a, b, mask, col + 2 * A, _index, sums[2]); AbsDifferenceSumMasked<align>(a, b, mask, col + 3 * A, _index, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) AbsDifferenceSumMasked<align>(a, b, mask, col, _index, sums[0]); if (width - bodyWidth) { const v128_u8 _mask = vec_and(tailMask, LoadMaskU8<false>(mask + width - A, _index)); AbsDifferenceSumMasked<false>(a, b, width - A, _mask, sums[0]); } *sum += ExtractSum(sums[0]); a += aStride; b += bStride; mask += maskStride; } }
void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height, const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); v128_u8 _value = SetU8(value); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]); ConditionalSquareSum<align, compareType>(src, mask, col + A, _value, sums[1]); ConditionalSquareSum<align, compareType>(src, mask, col + 2 * A, _value, sums[2]); ConditionalSquareSum<align, compareType>(src, mask, col + 3 * A, _value, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]); if (alignedWidth != width) { const v128_u8 _mask = Compare8u<compareType>(Load<false>(mask + width - A), _value); const v128_u8 _src = vec_and(vec_and(Load<false>(src + width - A), _mask), tailMask); sums[0] = vec_msum(_src, _src, sums[0]); } *sum += ExtractSum(sums[0]); src += srcStride; mask += maskStride; } }
// Indexed overload: builds the element name "<name>[<index>]" and forwards to the
// two-argument SetU8(name, val), returning its result.
//
// The index is unsigned, so it is formatted with %u (and cast to unsigned int so the
// argument type matches the conversion exactly); %d would print indices above INT_MAX
// as negative numbers.
// NOTE(review): assumes Zion::StringFormat follows printf conversion rules - confirm.
bool CObject::SetU8(const char* name, _U32 index, _U8 val)
{
    return SetU8(Zion::StringFormat("%s[%u]", name, static_cast<unsigned int>(index)).c_str(), val);
}