template<bool align, bool compensation> void ReduceGray3x3(
            const uint8_t* src, size_t srcWidth, size_t srcHeight, size_t srcStride,
            uint8_t* dst, size_t dstWidth, size_t dstHeight, size_t dstStride)	
        {
            assert(srcWidth >= A && (srcWidth + 1)/2 == dstWidth && (srcHeight + 1)/2 == dstHeight);
            if(align)
                assert(Aligned(src) && Aligned(srcStride));

            size_t lastOddCol = srcWidth - AlignLo(srcWidth, 2);
            size_t bodyWidth = AlignLo(srcWidth, A);
            for(size_t row = 0; row < srcHeight; row += 2, dst += dstStride, src += 2*srcStride)
            {
                const uint8_t * s1 = src;
                const uint8_t * s0 = s1 - (row ? srcStride : 0);
                const uint8_t * s2 = s1 + (row != srcHeight - 1 ? srcStride : 0);

				vst1_u8(dst, ReduceRow<compensation>(ReduceColNose<align>(s0),
                    ReduceColNose<align>(s1), ReduceColNose<align>(s2)));

                for(size_t srcCol = A, dstCol = HA; srcCol < bodyWidth; srcCol += A, dstCol += HA)
					vst1_u8(dst + dstCol, ReduceRow<compensation>(ReduceColBody<align>(s0 + srcCol),
                    ReduceColBody<align>(s1 + srcCol), ReduceColBody<align>(s2 + srcCol)));

                if(bodyWidth != srcWidth)
                {
                    size_t srcCol = srcWidth - A - lastOddCol;
                    size_t dstCol = dstWidth - HA - lastOddCol;
					vst1_u8(dst + dstCol, ReduceRow<compensation>(ReduceColBody<false>(s0 + srcCol),
                        ReduceColBody<false>(s1 + srcCol), ReduceColBody<false>(s2 + srcCol)));
                    if(lastOddCol)
                        dst[dstWidth - 1] = Base::GaussianBlur3x3<compensation>(s0 + srcWidth, s1 + srcWidth, s2 + srcWidth, -2, -1, -1);
                }
            }
        }
        template <bool align> void AbsDifferenceSum(
            const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
            size_t width, size_t height, uint64_t * sum)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth);
            *sum = 0;
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
                for (; col < alignedWidth; col += QA)
                {
                    AbsDifferenceSum<align>(a, b, col, sums[0]);
                    AbsDifferenceSum<align>(a, b, col + A, sums[1]);
                    AbsDifferenceSum<align>(a, b, col + 2 * A, sums[2]);
                    AbsDifferenceSum<align>(a, b, col + 3 * A, sums[3]);
                }
                sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3]));
                for (; col < bodyWidth; col += A)
                    AbsDifferenceSum<align>(a, b, col, sums[0]);
                if (width - bodyWidth)
                    AbsDifferenceSumMasked<false>(a, b, width - A, tailMask, sums[0]);
                *sum += ExtractSum(sums[0]);
                a += aStride;
                b += bStride;
            }
        }
        void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(src) && Aligned(stride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
            v128_u8 _value = SIMD_VEC_SET1_EPI8(value);
            v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                for (; col < alignedWidth; col += QA)
                {
                    ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
                    ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]);
                    ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]);
                    ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]);
                }
                for (; col < bodyWidth; col += A)
                    ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
                if (alignedWidth != width)
                {
                    const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask);
                    counts[0] = vec_msum(mask, K8_01, counts[0]);
                }
                src += stride;
            }
            counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
            *count = ExtractSum(counts[0]);
        }
        template <bool align> SIMD_INLINE void SquaredDifferenceSum16f(const uint16_t * a, const uint16_t * b, size_t size, float * sum)
        {
            assert(size >= F);
            if (align)
                assert(Aligned(a) && Aligned(b));

            size_t partialAlignedSize = AlignLo(size, F);
            size_t fullAlignedSize = AlignLo(size, DF);
            size_t i = 0;
            float32x4_t sums[2] = { vdupq_n_f32(0), vdupq_n_f32(0) };
            if (fullAlignedSize)
            {
                for (; i < fullAlignedSize; i += DF)
                {
                    SquaredDifferenceSum16f<align>(a, b, i + F * 0, sums[0]);
                    SquaredDifferenceSum16f<align>(a, b, i + F * 1, sums[1]);
                }
                sums[0] = vaddq_f32(sums[0], sums[1]);
            }
            for (; i < partialAlignedSize; i += F)
                SquaredDifferenceSum16f<align>(a, b, i, sums[0]);
            if (partialAlignedSize != size)
            {
                float32x4_t tailMask = RightNotZero(size - partialAlignedSize);
                float32x4_t _a = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + size - F));
                float32x4_t _b = vcvt_f32_f16((float16x4_t)LoadHalf<align>(a + size - F));
                float32x4_t _d = And(vsubq_f32(_a, _b), tailMask);
                sums[0] = vaddq_f32(sums[0], vmulq_f32(_d, _d));
            }
            *sum = ExtractSum32f(sums[0]);
        }
Beispiel #5
0
        template <bool align> void BgrToGray(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * gray, size_t grayStride)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(bgr) && Aligned(bgrStride) && Aligned(gray) && Aligned(grayStride));

            size_t alignedWidth = AlignLo(width, A);
            for(size_t row = 0; row < height; ++row)
            {
                Loader<align> _bgr(bgr);
                Storer<align> _gray(gray);
                BgrToGray<align, true>(_bgr, _gray);
                for(size_t col = A; col < alignedWidth; col += A)
                    BgrToGray<align, false>(_bgr, _gray);
                Flush(_gray);

                if(alignedWidth != width)
                {
                    Loader<false> _bgr(bgr + 3*(width - A));
                    Storer<false> _gray(gray + width - A);
                    BgrToGray<false, true>(_bgr, _gray);
                    Flush(_gray);
                }

                bgr += bgrStride;
                gray += grayStride;
            }
        }
        template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height, 
            uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride)
        {
            assert(width >= A);
            if(align)
            {
                assert(Aligned(backgroundValue) && Aligned(backgroundValueStride));
                assert(Aligned(backgroundCount) && Aligned(backgroundCountStride));
                assert(Aligned(mask) && Aligned(maskStride));
            }

            const __m256i _threshold = _mm256_set1_epi8((char)threshold);
            size_t alignedWidth = AlignLo(width, A);
            __m256i tailMask = SetMask<uint8_t>(0, A - width + alignedWidth, 1);
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += A)
                    EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01);
                if(alignedWidth != width)
                    EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask);
                backgroundValue += backgroundValueStride;
                backgroundCount += backgroundCountStride;
                mask += maskStride;
            }		
        }
        template <bool align> void AddFeatureDifference(const uint8_t * value, size_t valueStride, size_t width, size_t height, 
            const uint8_t * lo, size_t loStride, const uint8_t * hi, size_t hiStride,
            uint16_t weight, uint8_t * difference, size_t differenceStride)
        {
            assert(width >= A);
            if(align)
            {
                assert(Aligned(value) && Aligned(valueStride));
                assert(Aligned(lo) && Aligned(loStride));
                assert(Aligned(hi) && Aligned(hiStride));
                assert(Aligned(difference) && Aligned(differenceStride));
            }

            size_t alignedWidth = AlignLo(width, A);
            __m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + alignedWidth);
            __m128i _weight = _mm_set1_epi16((short)weight);

            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += A)
                    AddFeatureDifference<align>(value, lo, hi, difference, col, _weight, K_INV_ZERO);
                if(alignedWidth != width)
                    AddFeatureDifference<false>(value, lo, hi, difference, width - A, _weight, tailMask);
                value += valueStride;
                lo += loStride;
                hi += hiStride;
                difference += differenceStride;
            }
        }
        template<bool align> void AbsSecondDerivativeHistogram(const uint8_t *src, size_t width, size_t height, size_t stride,
            size_t step, size_t indent, uint32_t * histogram)
        {
            memset(histogram, 0, sizeof(uint32_t)*HISTOGRAM_SIZE);

            Buffer buffer(stride);
            buffer.p += indent;
            src += indent*(stride + 1);
            height -= 2*indent;
            width -= 2*indent;

            ptrdiff_t bodyStart = (uint8_t*)AlignHi(buffer.p, A) - buffer.p;
            ptrdiff_t bodyEnd = bodyStart + AlignLo(width - bodyStart, A);
            size_t rowStep = step*stride;
            for(size_t row = 0; row < height; ++row)
            {
                if(bodyStart)
                    AbsSecondDerivative<false>(src, step, rowStep, buffer.p);
                for(ptrdiff_t col = bodyStart; col < bodyEnd; col += A)
                    AbsSecondDerivative<align>(src + col, step, rowStep, buffer.p + col);
                if(width != (size_t)bodyEnd)
                    AbsSecondDerivative<false>(src + width - A, step, rowStep, buffer.p + width - A);

                for(size_t i = 0; i < width; ++i)
                    ++histogram[buffer.p[i]];

                src += stride;
            }
        }
		template <bool align> void SquaredDifferenceSum(
			const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);
			if(align)
			{
				assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			__m128i tailMask = ShiftLeft(K_INV_ZERO, A - width + bodyWidth);
			__m128i fullSum = _mm_setzero_si128();
			for(size_t row = 0; row < height; ++row)
			{
				__m128i rowSum = _mm_setzero_si128();
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					const __m128i a_ = Load<align>((__m128i*)(a + col));
					const __m128i b_ = Load<align>((__m128i*)(b + col)); 
					rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				if(width - bodyWidth)
				{
					const __m128i a_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(a + width - A)));
					const __m128i b_ = _mm_and_si128(tailMask, Load<false>((__m128i*)(b + width - A))); 
					rowSum = _mm_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				fullSum = _mm_add_epi64(fullSum, HorizontalSum32(rowSum));
				a += aStride;
				b += bStride;
			}
			*sum = ExtractInt64Sum(fullSum);
		}
Beispiel #10
0
        template <bool align> void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red)
        {
            size_t size = width*3;
            size_t step = A*3;
            size_t alignedSize = AlignLo(width, A)*3;

            uint32_t bgrb = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(blue) << 24);
            uint32_t grbg = uint32_t(green) | (uint32_t(red) << 8) | (uint32_t(blue) << 16) | (uint32_t(green) << 24);
            uint32_t rbgr = uint32_t(red) | (uint32_t(blue) << 8) | (uint32_t(green) << 16) | (uint32_t(red) << 24);

            __m128i bgrs[3];
            bgrs[0] = _mm_setr_epi32(bgrb, grbg, rbgr, bgrb);
            bgrs[1] = _mm_setr_epi32(grbg, rbgr, bgrb, grbg);
            bgrs[2] = _mm_setr_epi32(rbgr, bgrb, grbg, rbgr);
            for(size_t row = 0; row < height; ++row)
            {
                size_t offset = 0;
                for(; offset < alignedSize; offset += step)
                {
                    Store<align>((__m128i*)(dst + offset) + 0, bgrs[0]);
                    Store<align>((__m128i*)(dst + offset) + 1, bgrs[1]);
                    Store<align>((__m128i*)(dst + offset) + 2, bgrs[2]);
                }
                if(offset < size)
                {
                    offset = size - step;
                    Store<false>((__m128i*)(dst + offset) + 0, bgrs[0]);
                    Store<false>((__m128i*)(dst + offset) + 1, bgrs[1]);
                    Store<false>((__m128i*)(dst + offset) + 2, bgrs[2]);
                }
                dst += stride;
            }
        }
        void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count)
        {
            assert(width >= HA);
            if (align)
                assert(Aligned(src) && Aligned(stride));

            size_t alignedWidth = AlignLo(width, DA);
            size_t bodyWidth = Simd::AlignLo(width, HA);
            v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth);
            v128_s16 _value = SIMD_VEC_SET1_EPI16(value);
            v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
            for (size_t row = 0; row < height; ++row)
            {
                const int16_t * s = (const int16_t *)src;
                size_t col = 0;
                for (; col < alignedWidth; col += DA)
                {
                    ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
                    ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]);
                    ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]);
                    ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]);
                }
                for (; col < bodyWidth; col += HA)
                    ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
                if (alignedWidth != width)
                {
                    const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask);
                    counts[0] = vec_msum(mask, K16_0001, counts[0]);
                }
                src += stride;
            }
            counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
            *count = ExtractSum(counts[0]);
        }
Beispiel #12
0
        template <bool align> void StretchGray2x2(
            const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride, 
            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
        {
            assert(srcWidth*2 == dstWidth && srcHeight*2 == dstHeight && srcWidth >= A);
            if(align)
            {
                assert(Aligned(src) && Aligned(srcStride));
                assert(Aligned(dst) && Aligned(dstStride));
            }

            size_t alignedWidth = AlignLo(srcWidth, A);
            for(size_t row = 0; row < srcHeight; ++row)
            {
                uint8_t * dstEven = dst;
                uint8_t * dstOdd = dst + dstStride;
                for(size_t srcCol = 0, dstCol = 0; srcCol < alignedWidth; srcCol += A, dstCol += DA)
                {
                    __m256i value = LoadPermuted<align>((__m256i*)(src + srcCol));
                    StoreUnpacked<align>(value, dstEven + dstCol);
                    StoreUnpacked<align>(value, dstOdd + dstCol);
                }
                if(alignedWidth != srcWidth)
                {
                    __m256i value = LoadPermuted<false>((__m256i*)(src + srcWidth - A));
                    StoreUnpacked<false>(value, dstEven + dstWidth - 2*A);
                    StoreUnpacked<false>(value, dstOdd + dstWidth - 2*A);
                }
                src += srcStride;
                dst += 2*dstStride;
            }
        }
Beispiel #13
0
		template <bool align> void Yuv444pToHue(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, 
			size_t width, size_t height, uint8_t * hue, size_t hueStride)
		{
			assert(width >= A);
			if(align)
			{
				assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
				assert(Aligned(v) && Aligned(vStride) && Aligned(hue) && Aligned(hueStride));
			}

            const __m128 KF_255_DIV_6 = _mm_set_ps1(Base::KF_255_DIV_6);

			size_t bodyWidth = AlignLo(width, A);
			size_t tail = width - bodyWidth;
			for(size_t row = 0; row < height; row += 1)
			{
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					Store<align>((__m128i*)(hue + col), YuvToHue8(Load<align>((__m128i*)(y + col)), 
						Load<align>((__m128i*)(u + col)), Load<align>((__m128i*)(v + col)), KF_255_DIV_6));
				}
				if(tail)
				{
					size_t offset = width - A;
					Store<false>((__m128i*)(hue + offset), YuvToHue8(Load<false>((__m128i*)(y + offset)), 
						Load<false>((__m128i*)(u + offset)), Load<false>((__m128i*)(v + offset)), KF_255_DIV_6));
				}
				y += yStride;
				u += uStride;
				v += vStride;
				hue += hueStride;
			}
		}
        template <bool align> void LbpEstimate(
            const uint8_t * src, size_t srcStride, size_t width, size_t height, uint8_t * dst, size_t dstStride)
        {
            assert(width >= 2);
            if (align)
                assert(Aligned(src) && Aligned(srcStride) && Aligned(dst) && Aligned(dstStride));

            size_t alignedWidth = AlignLo(width - 2, A) + 1;
            __mmask64 tailMask = Aligned(width - alignedWidth);

            memset(dst, 0, width);
            src += srcStride;
            dst += dstStride;
            for (size_t row = 2; row < height; ++row)
            {
                dst[0] = 0;
                size_t col = 1;
                for (; col < alignedWidth; col += A)
                    LbpEstimate<align, false>(src + col, srcStride, dst + col);
                if (col < width)
                    LbpEstimate<align, false>(src + col, srcStride, dst + col, tailMask);
                dst[width - 1] = 0;
                src += srcStride;
                dst += dstStride;
            }
            memset(dst, 0, width);
        }
Beispiel #15
0
        template <bool align> void BgraToYuv422p(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            assert((width%2 == 0) && (width >= DA));
            if(align)
            {
                assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
                assert(Aligned(v) && Aligned(vStride) && Aligned(bgra) && Aligned(bgraStride));
            }

            size_t alignedWidth = AlignLo(width, DA);
            const size_t A8 = A*8;
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t colUV = 0, colY = 0, colBgra = 0; colY < alignedWidth; colY += DA, colUV += A, colBgra += A8)
                    BgraToYuv422p<align>(bgra + colBgra, y + colY, u + colUV, v + colUV);
                if(width != alignedWidth)
                {
                    size_t offset = width - DA;
                    BgraToYuv422p<false>(bgra + offset*4, y + offset, u + offset/2, v + offset/2);
                }
                y += yStride;
                u += uStride;
                v += vStride;
                bgra += bgraStride;
            }
        }
Beispiel #16
0
        void FillBgr(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red)
        {
            size_t size = width*3;
            size_t step = sizeof(size_t)*3;
            size_t alignedSize = AlignLo(width, sizeof(size_t))*3;
            size_t bgrs[3];
#if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE)
            bgrs[0] = Fill64(blue, green, red);
            bgrs[1] = Fill64(red, blue, green);
            bgrs[2] = Fill64(green, red, blue);

#else
            bgrs[0] = Fill32(blue, green, red);
            bgrs[1] = Fill32(green, red, blue);
            bgrs[2] = Fill32(red, blue, green);
#endif
            for(size_t row = 0; row < height; ++row)
            {
                size_t offset = 0;
                for(; offset < alignedSize; offset += step)
                {
                    ((size_t*)(dst + offset))[0] = bgrs[0];
                    ((size_t*)(dst + offset))[1] = bgrs[1];
                    ((size_t*)(dst + offset))[2] = bgrs[2];
                }
                for(; offset < size; offset += 3)
                {
                    (dst + offset)[0] = blue;
                    (dst + offset)[1] = green;
                    (dst + offset)[2] = red;
                }
                dst += stride;
            }
        }
Beispiel #17
0
        void FillBgra(uint8_t * dst, size_t stride, size_t width, size_t height, uint8_t blue, uint8_t green, uint8_t red, uint8_t alpha)
        {
#ifdef SIMD_BIG_ENDIAN
            uint32_t bgra32 = uint32_t(alpha) | (uint32_t(red) << 8) | (uint32_t(green) << 16) | (uint32_t(blue) << 24);
#else
            uint32_t bgra32 = uint32_t(blue) | (uint32_t(green) << 8) | (uint32_t(red) << 16) | (uint32_t(alpha) << 24);
#endif

#if defined(SIMD_X64_ENABLE) || defined(SIMD_PPC64_ENABLE)
            uint64_t bgra64 = uint64_t(bgra32) | (uint64_t(bgra32) << 32);
            size_t alignedWidth = AlignLo(width, 2);
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += 2)
                    *((uint64_t*)((uint32_t*)dst + col)) = bgra64;
                if(width != alignedWidth)
                    ((uint32_t*)dst)[width - 1] = bgra32;
                dst += stride;
            }
#else
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < width; ++col)
                    ((uint32_t*)dst)[col] = bgra32;
                dst += stride;
            }
#endif        
        }
Beispiel #18
0
		template <bool align> void Yuv444pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, 
			size_t width, size_t height, uint8_t * bgr, size_t bgrStride)
		{
			assert(width >= A);
			if(align)
			{
				assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
				assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			size_t tail = width - bodyWidth;
            size_t A3 = A*3;
			for(size_t row = 0; row < height; ++row)
			{
				for(size_t colYuv = 0, colBgr = 0; colYuv < bodyWidth; colYuv += A, colBgr += A3)
				{
					Yuv444pToBgr<align>(y + colYuv, u + colYuv, v + colYuv, bgr + colBgr);
				}
				if(tail)
				{
					size_t col = width - A;
					Yuv444pToBgr<false>(y + col, u + col, v + col, bgr + 3*col);
				}
				y += yStride;
				u += uStride;
				v += vStride;
				bgr += bgrStride;
			}
		}
Beispiel #19
0
        template <bool align> void InterleaveUv(const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, size_t width, size_t height, uint8_t * uv, size_t uvStride)
        {
            assert(width >= A);
            if(align)
            {
                assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride));
            }

            size_t bodyWidth = AlignLo(width, A);
            size_t tail = width - bodyWidth;
            for(size_t row = 0; row < height; ++row)
            {
				for (size_t col = 0, offset = 0; col < bodyWidth; col += A, offset += DA)
					InterleaveUv<align>(u + col, v + col, uv + offset);
                if(tail)
                {
                    size_t col = width - A;
                    size_t offset = 2*col;
					InterleaveUv<false>(u + col, v + col, uv + offset);
                }
                u += uStride;
                v += vStride;
                uv += uvStride;
            }
        }
Beispiel #20
0
        template <bool align> void Yuv422pToBgr(const uint8_t * y, size_t yStride, const uint8_t * u, size_t uStride, const uint8_t * v, size_t vStride, 
            size_t width, size_t height, uint8_t * bgr, size_t bgrStride)
        {
            assert((width%2 == 0) && (width >= DA));
            if(align)
            {
                assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
                assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
            }

            size_t bodyWidth = AlignLo(width, DA);
            size_t tail = width - bodyWidth;
            size_t A6 = A*6;
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t colUV = 0, colY = 0, colBgr = 0; colY < bodyWidth; colY += DA, colUV += A, colBgr += A6)
                    Yuv422pToBgr<align>(y + colY, u + colUV, v + colUV, bgr + colBgr);
                if(tail)
                {
                    size_t offset = width - DA;
                    Yuv422pToBgr<false>(y + offset, u + offset/2, v + offset/2, bgr + 3*offset);
                }
                y += yStride;
                u += uStride;
                v += vStride;
                bgr += bgrStride;
            }
        }
		template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height,
			uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride)
		{
			assert(width >= A);
			if (align)
			{
				assert(Aligned(backgroundValue) && Aligned(backgroundValueStride));
				assert(Aligned(backgroundCount) && Aligned(backgroundCountStride));
				assert(Aligned(mask) && Aligned(maskStride));
			}

			const uint8x16_t _threshold = vld1q_dup_u8(&threshold);
			size_t alignedWidth = AlignLo(width, A);
			uint8x16_t tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
			for (size_t row = 0; row < height; ++row)
			{
				for (size_t col = 0; col < alignedWidth; col += A)
					EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01);
				if (alignedWidth != width)
					EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask);
				backgroundValue += backgroundValueStride;
				backgroundCount += backgroundCountStride;
				mask += maskStride;
			}
		}
Beispiel #22
0
        void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum)
        {
            Buffer buffer(count);
            size_t alignedCount = AlignLo(count, 4);

            for(size_t j = 0; j < length; ++j)
            {
                size_t i = 0;
                float v = x[j];
                __m128 _v = _mm_set1_ps(v);
                for(; i < alignedCount; i += 4)
                {
                    __m128 sums = Load<true>(buffer.sums + i);
                    __m128 _svs = Load<false>(svs + i);
                    Store<true>(buffer.sums + i, _mm_add_ps(sums, _mm_mul_ps(_v, _svs)));
                }
                for(; i < count; ++i)
                    buffer.sums[i] += v*svs[i];
                svs += count;
            }

            size_t i = 0;
            __m128 _sum = _mm_setzero_ps();
            for(; i < alignedCount; i += 4)
            {
                __m128 sums = Load<true>(buffer.sums + i);
                __m128 _weights = Load<false>(weights + i);
                _sum = _mm_add_ps(_sum, _mm_mul_ps(sums, _weights));
            }
            *sum = ExtractSum(_sum);
            for(; i < count; ++i)
                *sum += buffer.sums[i]*weights[i];
        }
Beispiel #23
0
        template <bool align> void StretchGray2x2(
            const uint8_t * src, size_t srcWidth, size_t srcHeight, size_t srcStride, 
            uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
        {
            assert(srcWidth*2 == dstWidth && srcHeight*2 == dstHeight && srcWidth >= A);
            if(align)
            {
                assert(Aligned(src) && Aligned(srcStride));
                assert(Aligned(dst) && Aligned(dstStride));
            }

            size_t alignedWidth = AlignLo(srcWidth, A);
            for(size_t row = 0; row < srcHeight; ++row)
            {
                Storer<align> even(dst), odd(dst + dstStride);
                StretchGray2x2<align, true>(src, even, odd);
                for(size_t col = A; col < alignedWidth; col += A)
                    StretchGray2x2<align, false>(src + col, even, odd);
                Flush(even, odd);

                if(alignedWidth != srcWidth)
                {
                    Storer<false> even(dst + dstWidth - 2*A), odd(dst + dstStride + dstWidth - 2*A);
                    StretchGray2x2<false, true>(src + srcWidth - A, even, odd);
                    Flush(even, odd);
                }
                src += srcStride;
                dst += 2*dstStride;
            }
        }
Beispiel #24
0
        template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
            const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
        {
            assert(width >= HA);
            if(align)
            {
                assert(Aligned(blue) && Aligned(blueStride));
                assert(Aligned(green) && Aligned(greenStride));
                assert(Aligned(red) && Aligned(redStride));
                assert(Aligned(bgra) && Aligned(bgraStride));
            }

            v128_u8 _alpha = SetU8(alpha);
            size_t alignedWidth = AlignLo(width, HA);
            for(size_t row = 0; row < height; ++row)
            {
                Storer<align> _bgra(bgra);
                Bgr48pToBgra32<align, true>(blue, green, red, 0, _alpha, _bgra);
                for(size_t col = HA; col < alignedWidth; col += HA)
                    Bgr48pToBgra32<align, false>(blue, green, red, col*2, _alpha, _bgra);
                Flush(_bgra);

                if(width != alignedWidth)
                {
                    Storer<false> _bgra(bgra + (width - HA)*4);
                    Bgr48pToBgra32<false, true>(blue, green, red, (width - HA)*2, _alpha, _bgra);
                    Flush(_bgra);
                }

                blue += blueStride;
                green += greenStride;
                red += redStride;
                bgra += bgraStride;
            }
        }
Beispiel #25
0
		template <bool align> void BgraToBgr(const uint8_t * bgra, size_t width, size_t height, size_t bgraStride, uint8_t * bgr, size_t bgrStride)
		{
            assert(width >= A);
            if(align)
                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));

            size_t alignedWidth = AlignLo(width, A);
            if(width == alignedWidth)
                alignedWidth -= A;

            __m128i k[3][2];
            k[0][0] = _mm_setr_epi8(0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE,  -1,  -1,  -1,  -1);
            k[0][1] = _mm_setr_epi8( -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, 0x0, 0x1, 0x2, 0x4);
            k[1][0] = _mm_setr_epi8(0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1);
            k[1][1] = _mm_setr_epi8( -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9);
            k[2][0] = _mm_setr_epi8(0xA, 0xC, 0xD, 0xE,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1,  -1);
            k[2][1] = _mm_setr_epi8( -1,  -1,  -1,  -1, 0x0, 0x1, 0x2, 0x4, 0x5, 0x6, 0x8, 0x9, 0xA, 0xC, 0xD, 0xE);

			for(size_t row = 0; row < height; ++row)
			{
                for(size_t col = 0; col < alignedWidth; col += A)
                    BgraToBgrBody<align>(bgra + 4*col, bgr + 3*col, k);
                if(width != alignedWidth)
                    BgraToBgr<false>(bgra + 4*(width - A), bgr + 3*(width - A), k);
                bgra += bgraStride;
                bgr += bgrStride;
			}
		}
Beispiel #26
0
        template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));

            size_t alignedWidth = AlignLo(width, A);
            if(width == alignedWidth)
                alignedWidth -= A;

            const v128_u8 _alpha = SetU8(alpha);

            for(size_t row = 0; row < height; ++row)
            {
                Loader<align> _bgr(bgr);
                Storer<align> _bgra(bgra);
                BgrToBgra<align, true>(_bgr, _alpha, _bgra);
                for(size_t col = A; col < alignedWidth; col += A)
                    BgrToBgra<align, false>(_bgr, _alpha, _bgra);
                Flush(_bgra);

                if(width != alignedWidth)
                {
                    Loader<false> _bgr(bgr + 3*(width - A));
                    Storer<false> _bgra(bgra + 4*(width - A));
                    BgrToBgra<false, true>(_bgr, _alpha, _bgra);
                    Flush(_bgra);
                }

                bgra += bgraStride;
                bgr += bgrStride;
            }
        }
Beispiel #27
0
        template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
            const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
        {
            assert(width >= HA);
            if(align)
            {
                assert(Aligned(blue) && Aligned(blueStride));
                assert(Aligned(green) && Aligned(greenStride));
                assert(Aligned(red) && Aligned(redStride));
                assert(Aligned(bgra) && Aligned(bgraStride));
            }

            __m128i _alpha = _mm_slli_si128(_mm_set1_epi16(alpha), 1);
            size_t alignedWidth = AlignLo(width, HA);
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0, srcOffset = 0, dstOffset = 0; col < alignedWidth; col += HA, srcOffset += A, dstOffset += DA)
                    Bgr48pToBgra32<align>(bgra + dstOffset, blue, green, red, srcOffset, _alpha);
                if(width != alignedWidth)
                    Bgr48pToBgra32<false>(bgra + (width - HA)*4, blue, green, red, (width - HA)*2, _alpha);
                blue += blueStride;
                green += greenStride;
                red += redStride;
                bgra += bgraStride;
            }
        }
Beispiel #28
0
        template <bool align> void BgrToYuv444p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            assert(width >= A);
            if(align)
            {
                assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
                assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
            }

            size_t alignedWidth = AlignLo(width, A);
            const size_t A3 = A*3;
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0, colBgr = 0; col < alignedWidth; col += A, colBgr += A3)
                    BgrToYuv444p<align>(bgr + colBgr, y + col, u + col, v + col);
                if(width != alignedWidth)
                {
                    size_t col = width - A;
                    BgrToYuv444p<false>(bgr + col*3, y + col, u + col, v + col);
                }
                y += yStride;
                u += uStride;
                v += vStride;
                bgr += bgrStride;
            }
        }
Beispiel #29
0
        template <bool align> void DeinterleaveUv(const uint8_t * uv, size_t uvStride, size_t width, size_t height, 
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(uv) && Aligned(uvStride) && Aligned(u) && Aligned(uStride) && Aligned(v) && Aligned(vStride));

            size_t alignedWidth = AlignLo(width, A);
            for(size_t row = 0; row < height; ++row)
            {
                Loader<align> _uv(uv);
                Storer<align> _u(u), _v(v);
                DeinterleavedUv<align, true>(_uv, _u, _v);
                for(size_t col = A; col < alignedWidth; col += A)
                    DeinterleavedUv<align, false>(_uv, _u, _v);
                Flush(_u, _v);

                if(width != alignedWidth)
                {
                    Loader<false> _uv(uv + 2*(width - A));
                    Storer<false> _u(u + width - A), _v(v + width - A);
                    DeinterleavedUv<false, true>(_uv, _u, _v);
                    Flush(_u, _v);
                }

                uv += uvStride;
                u += uStride;
                v += vStride;
            }
        }
Beispiel #30
0
        template <bool align> void BgrToYuv420p(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * y, size_t yStride,
            uint8_t * u, size_t uStride, uint8_t * v, size_t vStride)
        {
            assert((width%2 == 0) && (height%2 == 0) && (width >= DA) && (height >= 2));
            if(align)
            {
                assert(Aligned(y) && Aligned(yStride) && Aligned(u) &&  Aligned(uStride));
                assert(Aligned(v) && Aligned(vStride) && Aligned(bgr) && Aligned(bgrStride));
            }

            size_t alignedWidth = AlignLo(width, DA);
            const size_t A6 = A*6;
            for(size_t row = 0; row < height; row += 2)
            {
                for(size_t colUV = 0, colY = 0, colBgr = 0; colY < alignedWidth; colY += DA, colUV += A, colBgr += A6)
                    BgrToYuv420p<align>(bgr + colBgr, bgrStride, y + colY, yStride, u + colUV, v + colUV);
                if(width != alignedWidth)
                {
                    size_t offset = width - DA;
                    BgrToYuv420p<false>(bgr + offset*3, bgrStride, y + offset, yStride, u + offset/2, v + offset/2);
                }
                y += 2*yStride;
                u += uStride;
                v += vStride;
                bgr += 2*bgrStride;
            }
        }