template<bool align, bool compensation> void ReduceGray3x3( const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { assert(srcWidth >= A && (srcWidth + 1)/2 == dstWidth && (srcHeight + 1)/2 == dstHeight); if(align) assert(Aligned(src) && Aligned(srcStride)); size_t lastOddCol = srcWidth - AlignLo(srcWidth, 2); size_t bodyWidth = AlignLo(srcWidth, A); for(size_t row = 0; row < srcHeight; row += 2, dst += dstStride, src += 2*srcStride) { const uint8_t * s1 = src; const uint8_t * s0 = s1 - (row ? srcStride : 0); const uint8_t * s2 = s1 + (row != srcHeight - 1 ? srcStride : 0); vst1_u8(dst, ReduceRow<compensation>(ReduceColNose<align>(s0), ReduceColNose<align>(s1), ReduceColNose<align>(s2))); for(size_t srcCol = A, dstCol = HA; srcCol < bodyWidth; srcCol += A, dstCol += HA) vst1_u8(dst + dstCol, ReduceRow<compensation>(ReduceColBody<align>(s0 + srcCol), ReduceColBody<align>(s1 + srcCol), ReduceColBody<align>(s2 + srcCol))); if(bodyWidth != srcWidth) { size_t srcCol = srcWidth - A - lastOddCol; size_t dstCol = dstWidth - HA - lastOddCol; vst1_u8(dst + dstCol, ReduceRow<compensation>(ReduceColBody<false>(s0 + srcCol), ReduceColBody<false>(s1 + srcCol), ReduceColBody<false>(s2 + srcCol))); if(lastOddCol) dst[dstWidth - 1] = Base::GaussianBlur3x3<compensation>(s0 + srcWidth, s1 + srcWidth, s2 + srcWidth, -2, -1, -1); } } }
template <bool align> void AbsDifferenceSum( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) { assert(width >= A); if (align) assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth); *sum = 0; for (size_t row = 0; row < height; ++row) { size_t col = 0; v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (; col < alignedWidth; col += QA) { AbsDifferenceSum<align>(a, b, col, sums[0]); AbsDifferenceSum<align>(a, b, col + A, sums[1]); AbsDifferenceSum<align>(a, b, col + 2 * A, sums[2]); AbsDifferenceSum<align>(a, b, col + 3 * A, sums[3]); } sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3])); for (; col < bodyWidth; col += A) AbsDifferenceSum<align>(a, b, col, sums[0]); if (width - bodyWidth) AbsDifferenceSumMasked<false>(a, b, width - A, tailMask, sums[0]); *sum += ExtractSum(sums[0]); a += aStride; b += bStride; } }
void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count) { assert(width >= A); if (align) assert(Aligned(src) && Aligned(stride)); size_t alignedWidth = AlignLo(width, QA); size_t bodyWidth = AlignLo(width, A); v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth); v128_u8 _value = SIMD_VEC_SET1_EPI8(value); v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (size_t row = 0; row < height; ++row) { size_t col = 0; for (; col < alignedWidth; col += QA) { ConditionalCount8u<align, compareType>(src, col, _value, counts[0]); ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]); ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]); ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]); } for (; col < bodyWidth; col += A) ConditionalCount8u<align, compareType>(src, col, _value, counts[0]); if (alignedWidth != width) { const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask); counts[0] = vec_msum(mask, K8_01, counts[0]); } src += stride; } counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3])); *count = ExtractSum(counts[0]); }
// Sums the squared differences of two arrays of 16-bit values that are
// converted to float via vcvt_f32_f16, and writes the result to *sum.
//   a, b  - input arrays of `size` elements each.
//   align - when true, a and b are asserted to be aligned.
// Requires size >= F (asserted).
//
// Fixes:
//  * the tail loaded both operands from `a`, so the tail difference was always
//    zero; the second operand is now loaded from `b`;
//  * the tail address (size - F) is not aligned in general, so the tail now
//    uses the unaligned load instead of LoadHalf<align>.
template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum)
{
    assert(size >= F);
    if (align)
        assert(Aligned(a) && Aligned(b));

    size_t partialAlignedSize = AlignLo(size, F);
    size_t fullAlignedSize = AlignLo(size, DF);
    size_t i = 0;
    float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) };
    if (fullAlignedSize)
    {
        // Two vectors per iteration with independent accumulators.
        for (; i < fullAlignedSize; i += DF)
        {
            SquaredDifferenceSum16f<align>(a, b, i + F * 0, sums[0]);
            SquaredDifferenceSum16f<align>(a, b, i + F * 1, sums[1]);
        }
        sums[0] = vaddq_f32(sums[0], sums[1]);
    }
    for (; i < partialAlignedSize; i += F)
        SquaredDifferenceSum16f<align>(a, b, i, sums[0]);
    if (partialAlignedSize != size)
    {
        // The last F elements are reloaded; tailMask is applied to the
        // difference so that lanes outside the remainder contribute zero
        // (assumes RightNotZero selects the trailing lanes - confirm).
        float32x4_t tailMask = RightNotZero(size - partialAlignedSize);
        float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<false>(a + size - F));
        float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<false>(b + size - F));
        float32x4_t _d = And(vsubq_f32(_a, _b), tailMask);
        sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d));
    }
    *sum = ExtractSum32f(sums[0]);
}
template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride) { assert(width >= A); if(align) assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride)); size_t alignedWidth = AlignLo(width, A); for(size_t row = 0; row < height; ++row) { Loader<align> _bgr(bgr); Storer<align> _gray(gray); BgrToGray<align, true>(_bgr, _gray); for(size_t col = A; col < alignedWidth; col += A) BgrToGray<align, false>(_bgr, _gray); Flush(_gray); if(alignedWidth != width) { Loader<false> _bgr(bgr + 3*(width - A)); Storer<false> _gray(gray + width - A); BgrToGray<false, true>(_bgr, _gray); Flush(_gray); } bgr += bgrStride; gray += grayStride; } }
template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) { assert(width >= A); if(align) { assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); assert(Aligned(mask) && Aligned(maskStride)); } const __m256i _threshold = _mm256_set1_epi8((char)threshold); size_t alignedWidth = AlignLo(width, A); __m256i tailMask = SetMask<uint8_t>(0, A - width + alignedWidth, 1); for(size_t row = 0; row < height; ++row) { for(size_t col = 0; col < alignedWidth; col += A) EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01); if(alignedWidth != width) EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask); backgroundValue += backgroundValueStride; backgroundCount += backgroundCountStride; mask += maskStride; } }
template <bool align> void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride, uint16_t weight, uint8_t * difference, size_t differenceStride) { assert(width >= A); if(align) { assert(Aligned(value) && Aligned(valueStride)); assert(Aligned(lo) && Aligned(loStride)); assert(Aligned(hi) && Aligned(hiStride)); assert(Aligned(difference) && Aligned(differenceStride)); } size_t alignedWidth = AlignLo(width, A); __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + alignedWidth); __m128i _weight = _mm_set1_epi16((short)weight); for(size_t row = 0; row < height; ++row) { for(size_t col = 0; col < alignedWidth; col += A) AddFeatureDifference<align>(value, lo, hi, difference, col, _weight, K_INV_ZERO); if(alignedWidth != width) AddFeatureDifference<false>(value, lo, hi, difference, width - A, _weight, tailMask); value += valueStride; lo += loStride; hi += hiStride; difference += differenceStride; } }
template<bool align> void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride, size_t step, size_t indent, uint32_t * histogram) { memset(histogram, 0, sizeof(uint32_t)*HISTOGRAM_SIZE); Buffer buffer(stride); buffer.p += indent; src += indent*(stride + 1); height -= 2*indent; width -= 2*indent; ptrdiff_t bodyStart = (uint8_t*)AlignHi(buffer.p, A) - buffer.p; ptrdiff_t bodyEnd = bodyStart + AlignLo(width - bodyStart, A); size_t rowStep = step*stride; for(size_t row = 0; row < height; ++row) { if(bodyStart) AbsSecondDerivative<false>(src, step, rowStep, buffer.p); for(ptrdiff_t col = bodyStart; col < bodyEnd; col += A) AbsSecondDerivative<align>(src + col, step, rowStep, buffer.p + col); if(width != (size_t)bodyEnd) AbsSecondDerivative<false>(src + width - A, step, rowStep, buffer.p + width - A); for(size_t i = 0; i < width; ++i) ++histogram[buffer.p[i]]; src += stride; } }
template <bool align> void SquaredDifferenceSum( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if(align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); } size_t bodyWidth = AlignLo(width, A); __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth); __m128i fullSum = _mm_setzero_si128(); for(size_t row = 0; row < height; ++row) { __m128i rowSum = _mm_setzero_si128(); for(size_t col = 0; col < bodyWidth; col += A) { const __m128i a_ = Load<align>((__m128i*)(a + col)); const __m128i b_ = Load<align>((__m128i*)(b + col)); rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_)); } if(width - bodyWidth) { const __m128i a_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(a + width - A))); const __m128i b_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(b + width - A))); rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_)); } fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum)); a += aStride; b += bStride; } *sum = ExtractInt64Sum(fullSum); }
template <bool align> void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) { size_t size = width*3; size_t step = A*3; size_t alignedSize = AlignLo(width, A)*3; uint32_t bgrb = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(blue) << 24); uint32_t grbg = uint32_t(green) | (uint32_t(red) << 8) | (uint32_t(blue) << 16) | (uint32_t(green) << 24); uint32_t rbgr = uint32_t(red) | (uint32_t(blue) << 8) | (uint32_t(green) << 16) | (uint32_t(red) << 24); __m128i bgrs[3]; bgrs[0] = _mm_setr_epi32(bgrb, grbg, rbgr, bgrb); bgrs[1] = _mm_setr_epi32(grbg, rbgr, bgrb, grbg); bgrs[2] = _mm_setr_epi32(rbgr, bgrb, grbg, rbgr); for(size_t row = 0; row < height; ++row) { size_t offset = 0; for(; offset < alignedSize; offset += step) { Store<align>((__m128i*)(dst + offset) + 0, bgrs[0]); Store<align>((__m128i*)(dst + offset) + 1, bgrs[1]); Store<align>((__m128i*)(dst + offset) + 2, bgrs[2]); } if(offset < size) { offset = size - step; Store<false>((__m128i*)(dst + offset) + 0, bgrs[0]); Store<false>((__m128i*)(dst + offset) + 1, bgrs[1]); Store<false>((__m128i*)(dst + offset) + 2, bgrs[2]); } dst += stride; } }
void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count) { assert(width >= HA); if (align) assert(Aligned(src) && Aligned(stride)); size_t alignedWidth = AlignLo(width, DA); size_t bodyWidth = Simd::AlignLo(width, HA); v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth); v128_s16 _value = SIMD_VEC_SET1_EPI16(value); v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 }; for (size_t row = 0; row < height; ++row) { const int16_t * s = (const int16_t *)src; size_t col = 0; for (; col < alignedWidth; col += DA) { ConditionalCount16i<align, compareType>(s, col, _value, counts[0]); ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]); ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]); ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]); } for (; col < bodyWidth; col += HA) ConditionalCount16i<align, compareType>(s, col, _value, counts[0]); if (alignedWidth != width) { const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask); counts[0] = vec_msum(mask, K16_0001, counts[0]); } src += stride; } counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3])); *count = ExtractSum(counts[0]); }
template <bool align> void StretchGray2x2( const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { assert(srcWidth*2 == dstWidth && srcHeight*2 == dstHeight && srcWidth >= A); if(align) { assert(Aligned(src) && Aligned(srcStride)); assert(Aligned(dst) && Aligned(dstStride)); } size_t alignedWidth = AlignLo(srcWidth, A); for(size_t row = 0; row < srcHeight; ++row) { uint8_t * dstEven = dst; uint8_t * dstOdd = dst + dstStride; for(size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += A, dstCol += DA) { __m256i value = LoadPermuted<align>((__m256i*)(src + srcCol)); StoreUnpacked<align>(value, dstEven + dstCol); StoreUnpacked<align>(value, dstOdd + dstCol); } if(alignedWidth != srcWidth) { __m256i value = LoadPermuted<false>((__m256i*)(src + srcWidth - A)); StoreUnpacked<false>(value, dstEven + dstWidth - 2*A); StoreUnpacked<false>(value, dstOdd + dstWidth - 2*A); } src += srcStride; dst += 2*dstStride; } }
template <bool align> void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * hue, size_t hueStride) { assert(width >= A); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride)); } const __m128 KF_255_DIV_6 = _mm_set_ps1(Base::KF_255_DIV_6); size_t bodyWidth = AlignLo(width, A); size_t tail = width - bodyWidth; for(size_t row = 0; row < height; row += 1) { for(size_t col = 0; col < bodyWidth; col += A) { Store<align>((__m128i*)(hue + col), YuvToHue8(Load<align>((__m128i*)(y + col)), Load<align>((__m128i*)(u + col)), Load<align>((__m128i*)(v + col)), KF_255_DIV_6)); } if(tail) { size_t offset = width - A; Store<false>((__m128i*)(hue + offset), YuvToHue8(Load<false>((__m128i*)(y + offset)), Load<false>((__m128i*)(u + offset)), Load<false>((__m128i*)(v + offset)), KF_255_DIV_6)); } y += yStride; u += uStride; v += vStride; hue += hueStride; } }
template <bool align> void LbpEstimate( const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride) { assert(width >= 2); if (align) assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride)); size_t alignedWidth = AlignLo(width - 2, A) + 1; __mmask64 tailMask = Aligned(width - alignedWidth); memset(dst, 0, width); src += srcStride; dst += dstStride; for (size_t row = 2; row < height; ++row) { dst[0] = 0; size_t col = 1; for (; col < alignedWidth; col += A) LbpEstimate<align, false>(src + col, srcStride, dst + col); if (col < width) LbpEstimate<align, false>(src + col, srcStride, dst + col, tailMask); dst[width - 1] = 0; src += srcStride; dst += dstStride; } memset(dst, 0, width); }
template <bool align> void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { assert((width%2 == 0) && (width >= DA)); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride)); } size_t alignedWidth = AlignLo(width, DA); const size_t A8 = A*8; for(size_t row = 0; row < height; ++row) { for(size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8) BgraToYuv422p<align>(bgra + colBgra, y + colY, u + colUV, v + colUV); if(width != alignedWidth) { size_t offset = width - DA; BgraToYuv422p<false>(bgra + offset*4, y + offset, u + offset/2, v + offset/2); } y += yStride; u += uStride; v += vStride; bgra += bgraStride; } }
void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red) { size_t size = width*3; size_t step = sizeof(size_t)*3; size_t alignedSize = AlignLo(width, sizeof(size_t))*3; size_t bgrs[3]; #if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE) bgrs[0] = Fill64(blue, green, red); bgrs[1] = Fill64(red, blue, green); bgrs[2] = Fill64(green, red, blue); #else bgrs[0] = Fill32(blue, green, red); bgrs[1] = Fill32(green, red, blue); bgrs[2] = Fill32(red, blue, green); #endif for(size_t row = 0; row < height; ++row) { size_t offset = 0; for(; offset < alignedSize; offset += step) { ((size_t*)(dst + offset))[0] = bgrs[0]; ((size_t*)(dst + offset))[1] = bgrs[1]; ((size_t*)(dst + offset))[2] = bgrs[2]; } for(; offset < size; offset += 3) { (dst + offset)[0] = blue; (dst + offset)[1] = green; (dst + offset)[2] = red; } dst += stride; } }
// Scalar fill of a width x height 4-bytes-per-pixel image with one
// (blue, green, red, alpha) colour. The pixel is packed into a 32-bit word
// whose byte order depends on SIMD_BIG_ENDIAN; on 64-bit builds two pixels are
// written per store and an odd last pixel is written separately.
void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha)
{
#ifdef SIMD_BIG_ENDIAN
    const uint32_t bgra32 = uint32_t(alpha) | (uint32_t(red) << 8) | (uint32_t(green) << 16) | (uint32_t(blue) << 24);
#else
    const uint32_t bgra32 = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(alpha) << 24);
#endif

#if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE)
    const uint64_t bgra64 = uint64_t(bgra32) | (uint64_t(bgra32) << 32);
    const size_t pairedWidth = AlignLo(width, 2);
    const bool oddPixel = (width != pairedWidth);
    for(size_t row = 0; row < height; ++row)
    {
        uint32_t * pixels = (uint32_t*)dst;
        size_t col = 0;
        while(col < pairedWidth)
        {
            *((uint64_t*)(pixels + col)) = bgra64;
            col += 2;
        }
        if(oddPixel)
            pixels[width - 1] = bgra32;
        dst += stride;
    }
#else
    for(size_t row = 0; row < height; ++row)
    {
        uint32_t * pixels = (uint32_t*)dst;
        for(size_t col = 0; col < width; ++col)
            pixels[col] = bgra32;
        dst += stride;
    }
#endif
}
template <bool align> void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) { assert(width >= A); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); } size_t bodyWidth = AlignLo(width, A); size_t tail = width - bodyWidth; size_t A3 = A*3; for(size_t row = 0; row < height; ++row) { for(size_t colYuv = 0, colBgr = 0; colYuv < bodyWidth; colYuv += A, colBgr += A3) { Yuv444pToBgr<align>(y + colYuv, u + colYuv, v + colYuv, bgr + colBgr); } if(tail) { size_t col = width - A; Yuv444pToBgr<false>(y + col, u + col, v + col, bgr + 3*col); } y += yStride; u += uStride; v += vStride; bgr += bgrStride; } }
template <bool align> void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride) { assert(width >= A); if(align) { assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); } size_t bodyWidth = AlignLo(width, A); size_t tail = width - bodyWidth; for(size_t row = 0; row < height; ++row) { for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += DA) InterleaveUv<align>(u + col, v + col, uv + offset); if(tail) { size_t col = width - A; size_t offset = 2*col; InterleaveUv<false>(u + col, v + col, uv + offset); } u += uStride; v += vStride; uv += uvStride; } }
template <bool align> void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * bgr, size_t bgrStride) { assert((width%2 == 0) && (width >= DA)); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); } size_t bodyWidth = AlignLo(width, DA); size_t tail = width - bodyWidth; size_t A6 = A*6; for(size_t row = 0; row < height; ++row) { for(size_t colUV = 0, colY = 0, colBgr = 0; colY < bodyWidth; colY += DA, colUV += A, colBgr += A6) Yuv422pToBgr<align>(y + colY, u + colUV, v + colUV, bgr + colBgr); if(tail) { size_t offset = width - DA; Yuv422pToBgr<false>(y + offset, u + offset/2, v + offset/2, bgr + 3*offset); } y += yStride; u += uStride; v += vStride; bgr += bgrStride; } }
template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride) { assert(width >= A); if (align) { assert(Aligned(backgroundValue) && Aligned(backgroundValueStride)); assert(Aligned(backgroundCount) && Aligned(backgroundCountStride)); assert(Aligned(mask) && Aligned(maskStride)); } const uint8x16_t _threshold = vld1q_dup_u8(&threshold); size_t alignedWidth = AlignLo(width, A); uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth); for (size_t row = 0; row < height; ++row) { for (size_t col = 0; col < alignedWidth; col += A) EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01); if (alignedWidth != width) EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask); backgroundValue += backgroundValueStride; backgroundCount += backgroundCountStride; mask += maskStride; } }
void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum) { Buffer buffer(count); size_t alignedCount = AlignLo(count, 4); for(size_t j = 0; j < length; ++j) { size_t i = 0; float v = x[j]; __m128 _v = _mm_set1_ps(v); for(; i < alignedCount; i += 4) { __m128 sums = Load<true>(buffer.sums + i); __m128 _svs = Load<false>(svs + i); Store<true>(buffer.sums + i, _mm_add_ps(sums, _mm_mul_ps(_v, _svs))); } for(; i < count; ++i) buffer.sums[i] += v*svs[i]; svs += count; } size_t i = 0; __m128 _sum = _mm_setzero_ps(); for(; i < alignedCount; i += 4) { __m128 sums = Load<true>(buffer.sums + i); __m128 _weights = Load<false>(weights + i); _sum = _mm_add_ps(_sum, _mm_mul_ps(sums, _weights)); } *sum = ExtractSum(_sum); for(; i < count; ++i) *sum += buffer.sums[i]*weights[i]; }
template <bool align> void StretchGray2x2( const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride) { assert(srcWidth*2 == dstWidth && srcHeight*2 == dstHeight && srcWidth >= A); if(align) { assert(Aligned(src) && Aligned(srcStride)); assert(Aligned(dst) && Aligned(dstStride)); } size_t alignedWidth = AlignLo(srcWidth, A); for(size_t row = 0; row < srcHeight; ++row) { Storer<align> even(dst), odd(dst + dstStride); StretchGray2x2<align, true>(src, even, odd); for(size_t col = A; col < alignedWidth; col += A) StretchGray2x2<align, false>(src + col, even, odd); Flush(even, odd); if(alignedWidth != srcWidth) { Storer<false> even(dst + dstWidth - 2*A), odd(dst + dstStride + dstWidth - 2*A); StretchGray2x2<false, true>(src + srcWidth - A, even, odd); Flush(even, odd); } src += srcStride; dst += 2*dstStride; } }
template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) { assert(width >= HA); if(align) { assert(Aligned(blue) && Aligned(blueStride)); assert(Aligned(green) && Aligned(greenStride)); assert(Aligned(red) && Aligned(redStride)); assert(Aligned(bgra) && Aligned(bgraStride)); } v128_u8 _alpha = SetU8(alpha); size_t alignedWidth = AlignLo(width, HA); for(size_t row = 0; row < height; ++row) { Storer<align> _bgra(bgra); Bgr48pToBgra32<align, true>(blue, green, red, 0, _alpha, _bgra); for(size_t col = HA; col < alignedWidth; col += HA) Bgr48pToBgra32<align, false>(blue, green, red, col*2, _alpha, _bgra); Flush(_bgra); if(width != alignedWidth) { Storer<false> _bgra(bgra + (width - HA)*4); Bgr48pToBgra32<false, true>(blue, green, red, (width - HA)*2, _alpha, _bgra); Flush(_bgra); } blue += blueStride; green += greenStride; red += redStride; bgra += bgraStride; } }
template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride) { assert(width >= A); if(align) assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); size_t alignedWidth = AlignLo(width, A); if(width == alignedWidth) alignedWidth -= A; __m128i k[3][2]; k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1); k[0][1] = _mm_setr_epi8( -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4); k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1); k[1][1] = _mm_setr_epi8( -1, -1, -1, -1, -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9); k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); k[2][1] = _mm_setr_epi8( -1, -1, -1, -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE); for(size_t row = 0; row < height; ++row) { for(size_t col = 0; col < alignedWidth; col += A) BgraToBgrBody<align>(bgra + 4*col, bgr + 3*col, k); if(width != alignedWidth) BgraToBgr<false>(bgra + 4*(width - A), bgr + 3*(width - A), k); bgra += bgraStride; bgr += bgrStride; } }
template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) { assert(width >= A); if(align) assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride)); size_t alignedWidth = AlignLo(width, A); if(width == alignedWidth) alignedWidth -= A; const v128_u8 _alpha = SetU8(alpha); for(size_t row = 0; row < height; ++row) { Loader<align> _bgr(bgr); Storer<align> _bgra(bgra); BgrToBgra<align, true>(_bgr, _alpha, _bgra); for(size_t col = A; col < alignedWidth; col += A) BgrToBgra<align, false>(_bgr, _alpha, _bgra); Flush(_bgra); if(width != alignedWidth) { Loader<false> _bgr(bgr + 3*(width - A)); Storer<false> _bgra(bgra + 4*(width - A)); BgrToBgra<false, true>(_bgr, _alpha, _bgra); Flush(_bgra); } bgra += bgraStride; bgr += bgrStride; } }
template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height, const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha) { assert(width >= HA); if(align) { assert(Aligned(blue) && Aligned(blueStride)); assert(Aligned(green) && Aligned(greenStride)); assert(Aligned(red) && Aligned(redStride)); assert(Aligned(bgra) && Aligned(bgraStride)); } __m128i _alpha = _mm_slli_si128(_mm_set1_epi16(alpha), 1); size_t alignedWidth = AlignLo(width, HA); for(size_t row = 0; row < height; ++row) { for(size_t col = 0, srcOffset = 0, dstOffset = 0; col < alignedWidth; col += HA, srcOffset += A, dstOffset += DA) Bgr48pToBgra32<align>(bgra + dstOffset, blue, green, red, srcOffset, _alpha); if(width != alignedWidth) Bgr48pToBgra32<false>(bgra + (width - HA)*4, blue, green, red, (width - HA)*2, _alpha); blue += blueStride; green += greenStride; red += redStride; bgra += bgraStride; } }
template <bool align> void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { assert(width >= A); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); } size_t alignedWidth = AlignLo(width, A); const size_t A3 = A*3; for(size_t row = 0; row < height; ++row) { for(size_t col = 0, colBgr = 0; col < alignedWidth; col += A, colBgr += A3) BgrToYuv444p<align>(bgr + colBgr, y + col, u + col, v + col); if(width != alignedWidth) { size_t col = width - A; BgrToYuv444p<false>(bgr + col*3, y + col, u + col, v + col); } y += yStride; u += uStride; v += vStride; bgr += bgrStride; } }
template <bool align> void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { assert(width >= A); if(align) assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride)); size_t alignedWidth = AlignLo(width, A); for(size_t row = 0; row < height; ++row) { Loader<align> _uv(uv); Storer<align> _u(u), _v(v); DeinterleavedUv<align, true>(_uv, _u, _v); for(size_t col = A; col < alignedWidth; col += A) DeinterleavedUv<align, false>(_uv, _u, _v); Flush(_u, _v); if(width != alignedWidth) { Loader<false> _uv(uv + 2*(width - A)); Storer<false> _u(u + width - A), _v(v + width - A); DeinterleavedUv<false, true>(_uv, _u, _v); Flush(_u, _v); } uv += uvStride; u += uStride; v += vStride; } }
template <bool align> void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride, uint8_t * u, size_t uStride, uint8_t * v, size_t vStride) { assert((width%2 == 0) && (height%2 == 0) && (width >= DA) && (height >= 2)); if(align) { assert(Aligned(y) && Aligned(yStride) && Aligned(u) && Aligned(uStride)); assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride)); } size_t alignedWidth = AlignLo(width, DA); const size_t A6 = A*6; for(size_t row = 0; row < height; row += 2) { for(size_t colUV = 0, colY = 0, colBgr = 0; colY < alignedWidth; colY += DA, colUV += A, colBgr += A6) BgrToYuv420p<align>(bgr + colBgr, bgrStride, y + colY, yStride, u + colUV, v + colUV); if(width != alignedWidth) { size_t offset = width - DA; BgrToYuv420p<false>(bgr + offset*3, bgrStride, y + offset, yStride, u + offset/2, v + offset/2); } y += 2*yStride; u += uStride; v += vStride; bgr += 2*bgrStride; } }