void ConditionalCount16i(const uint8_t * src, size_t stride, size_t width, size_t height, int16_t value, uint32_t * count)
        {
            assert(width >= HA);
            if (align)
                assert(Aligned(src) && Aligned(stride));

            size_t alignedWidth = AlignLo(width, DA);
            size_t bodyWidth = Simd::AlignLo(width, HA);
            v128_u16 tailMask = ShiftLeft(K16_0001, HA - width + alignedWidth);
            v128_s16 _value = SIMD_VEC_SET1_EPI16(value);
            v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
            for (size_t row = 0; row < height; ++row)
            {
                const int16_t * s = (const int16_t *)src;
                size_t col = 0;
                for (; col < alignedWidth; col += DA)
                {
                    ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
                    ConditionalCount16i<align, compareType>(s, col + HA, _value, counts[1]);
                    ConditionalCount16i<align, compareType>(s, col + 2 * HA, _value, counts[2]);
                    ConditionalCount16i<align, compareType>(s, col + 3 * HA, _value, counts[3]);
                }
                for (; col < bodyWidth; col += HA)
                    ConditionalCount16i<align, compareType>(s, col, _value, counts[0]);
                if (alignedWidth != width)
                {
                    const v128_u16 mask = vec_and((v128_u16)Compare16i<compareType>(Load<false>(s + width - HA), _value), tailMask);
                    counts[0] = vec_msum(mask, K16_0001, counts[0]);
                }
                src += stride;
            }
            counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
            *count = ExtractSum(counts[0]);
        }
Esempio n. 2
0
        void SvmSumLinear(const float * x, const float * svs, const float * weights, size_t length, size_t count, float * sum)
        {
            Buffer buffer(count);
            size_t alignedCount = AlignLo(count, 4);

            for(size_t j = 0; j < length; ++j)
            {
                size_t i = 0;
                float v = x[j];
                __m128 _v = _mm_set1_ps(v);
                for(; i < alignedCount; i += 4)
                {
                    __m128 sums = Load<true>(buffer.sums + i);
                    __m128 _svs = Load<false>(svs + i);
                    Store<true>(buffer.sums + i, _mm_add_ps(sums, _mm_mul_ps(_v, _svs)));
                }
                for(; i < count; ++i)
                    buffer.sums[i] += v*svs[i];
                svs += count;
            }

            size_t i = 0;
            __m128 _sum = _mm_setzero_ps();
            for(; i < alignedCount; i += 4)
            {
                __m128 sums = Load<true>(buffer.sums + i);
                __m128 _weights = Load<false>(weights + i);
                _sum = _mm_add_ps(_sum, _mm_mul_ps(sums, _weights));
            }
            *sum = ExtractSum(_sum);
            for(; i < count; ++i)
                *sum += buffer.sums[i]*weights[i];
        }
        void ConditionalCount8u(const uint8_t * src, size_t stride, size_t width, size_t height, uint8_t value, uint32_t * count)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(src) && Aligned(stride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
            v128_u8 _value = SIMD_VEC_SET1_EPI8(value);
            v128_u32 counts[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                for (; col < alignedWidth; col += QA)
                {
                    ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
                    ConditionalCount8u<align, compareType>(src, col + A, _value, counts[1]);
                    ConditionalCount8u<align, compareType>(src, col + 2 * A, _value, counts[2]);
                    ConditionalCount8u<align, compareType>(src, col + 3 * A, _value, counts[3]);
                }
                for (; col < bodyWidth; col += A)
                    ConditionalCount8u<align, compareType>(src, col, _value, counts[0]);
                if (alignedWidth != width)
                {
                    const v128_u8 mask = vec_and(Compare8u<compareType>(Load<false>(src + width - A), _value), tailMask);
                    counts[0] = vec_msum(mask, K8_01, counts[0]);
                }
                src += stride;
            }
            counts[0] = vec_add(vec_add(counts[0], counts[1]), vec_add(counts[2], counts[3]));
            *count = ExtractSum(counts[0]);
        }
        template <bool align> void AbsDifferenceSum(
            const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
            size_t width, size_t height, uint64_t * sum)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth);
            *sum = 0;
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
                for (; col < alignedWidth; col += QA)
                {
                    AbsDifferenceSum<align>(a, b, col, sums[0]);
                    AbsDifferenceSum<align>(a, b, col + A, sums[1]);
                    AbsDifferenceSum<align>(a, b, col + 2 * A, sums[2]);
                    AbsDifferenceSum<align>(a, b, col + 3 * A, sums[3]);
                }
                sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3]));
                for (; col < bodyWidth; col += A)
                    AbsDifferenceSum<align>(a, b, col, sums[0]);
                if (width - bodyWidth)
                    AbsDifferenceSumMasked<false>(a, b, width - A, tailMask, sums[0]);
                *sum += ExtractSum(sums[0]);
                a += aStride;
                b += bStride;
            }
        }
        void ConditionalSquareGradientSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
        {
            assert(width >= A + 2 && height >= 3);
            if (align)
                assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride));

            src += srcStride;
            mask += maskStride;
            height -= 2;

            size_t bodyWidth = Simd::AlignLo(width - 1, A);
            v128_u8 noseMask = ShiftRight(K8_FF, 1);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + 1 + bodyWidth);
            size_t alignedWidth = Simd::AlignLo(bodyWidth - A, DA);

            v128_u8 _value = SetU8(value);
            *sum = 0;
            for (size_t row = 0; row < height; ++row)
            {
                v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
                {
                    const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + 1), _value), noseMask);
                    AddSquareDifference<false>(src + 1, 1, _mask, sums[0]);
                    AddSquareDifference<false>(src + 1, srcStride, _mask, sums[1]);
                }
                size_t col = A;
                for (; col < alignedWidth; col += DA)
                {
                    ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums);
                    ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col + A, _value, sums + 2);
                }
                for (; col < bodyWidth; col += A)
                    ConditionalSquareGradientSum<align, compareType>(src, srcStride, mask, col, _value, sums);
                if (bodyWidth != width - 1)
                {
                    size_t offset = width - A - 1;
                    const v128_u8 _mask = vec_and(Compare8u<compareType>(Load<false>(mask + offset), _value), tailMask);
                    AddSquareDifference<false>(src + offset, 1, _mask, sums[0]);
                    AddSquareDifference<false>(src + offset, srcStride, _mask, sums[1]);
                }
                sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3]));
                *sum += ExtractSum(sums[0]);
                src += srcStride;
                mask += maskStride;
            }
        }
        template <bool align> void AbsDifferenceSums3x3Masked(const uint8_t *current, size_t currentStride, const uint8_t *background, size_t backgroundStride,
            const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sums)
        {
            assert(height > 2 && width >= A + 2);
            if (align)
                assert(Aligned(background) && Aligned(backgroundStride));

            width -= 2;
            height -= 2;
            current += 1 + currentStride;
            background += 1 + backgroundStride;
            mask += 1 + maskStride;

            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth);
            v128_u8 _index = SetU8(index);

            for (size_t i = 0; i < 9; ++i)
                sums[i] = 0;

            for (size_t row = 0; row < height; ++row)
            {
                v128_u32 _sums[9];
                for (size_t i = 0; i < 9; ++i)
                    _sums[i] = K32_00000000;

                for (size_t col = 0; col < bodyWidth; col += A)
                {
                    const v128_u8 _mask = LoadMaskU8<false>(mask + col, _index);
                    const v128_u8 _current = vec_and(Load<false>(current + col), _mask);
                    AbsDifferenceSums3x3Masked<align>(_current, background + col, backgroundStride, _mask, _sums);
                }
                if (width - bodyWidth)
                {
                    const v128_u8 _mask = vec_and(LoadMaskU8<false>(mask + width - A, _index), tailMask);
                    const v128_u8 _current = vec_and(Load<false>(current + width - A), _mask);
                    AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, _mask, _sums);
                }

                for (size_t i = 0; i < 9; ++i)
                    sums[i] += ExtractSum(_sums[i]);

                current += currentStride;
                background += backgroundStride;
                mask += maskStride;
            }
        }
Esempio n. 7
0
        template <bool align> void LaplaceAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum)
        {
            assert(width > A);
            if(align)
                assert(Aligned(src) && Aligned(stride));

            size_t bodyWidth = Simd::AlignHi(width, A) - A;
            const uint8_t *src0, *src1, *src2;
            v128_u8 a[3][3];
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth);
            *sum = 0;

            for(size_t row = 0; row < height; ++row)
            {
                src0 = src + stride*(row - 1);
                src1 = src0 + stride;
                src2 = src1 + stride;
                if(row == 0)
                    src0 = src1;
                if(row == height - 1)
                    src2 = src1;

                v128_u32 sums[2] = {K32_00000000, K32_00000000};

                LoadNose3<align, 1>(src0 + 0, a[0]);
                LoadNose3<align, 1>(src1 + 0, a[1]);
                LoadNose3<align, 1>(src2 + 0, a[2]);
                LaplaceAbsSum(a, sums);
                for(size_t col = A; col < bodyWidth; col += A)
                {
                    LoadBody3<align, 1>(src0 + col, a[0]);
                    LoadBody3<align, 1>(src1 + col, a[1]);
                    LoadBody3<align, 1>(src2 + col, a[2]);
                    LaplaceAbsSum(a, sums);
                }
                LoadTail3<false, 1>(src0 + width - A, a[0]);
                LoadTail3<false, 1>(src1 + width - A, a[1]);
                LoadTail3<false, 1>(src2 + width - A, a[2]);
                SetMask3x3(a, tailMask);
                LaplaceAbsSum(a, sums);

                *sum += ExtractSum(vec_add(sums[0], sums[1]));
            }
        }
        template <bool align> void AbsDifferenceSums3x3(const uint8_t * current, size_t currentStride,
            const uint8_t * background, size_t backgroundStride, size_t width, size_t height, uint64_t * sums)
        {
            assert(height > 2 && width >= A + 2);
            if (align)
                assert(Aligned(background) && Aligned(backgroundStride));

            width -= 2;
            height -= 2;
            current += 1 + currentStride;
            background += 1 + backgroundStride;

            size_t alignedWidth = AlignLo(width, DA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth);

            memset(sums, 0, 9 * sizeof(uint64_t));

            for (size_t row = 0; row < height; ++row)
            {
                v128_u32 _sums[2][9];
                memset(_sums, 0, 18 * sizeof(v128_u32));

                size_t col = 0;
                for (; col < alignedWidth; col += DA)
                {
                    AbsDifferenceSums3x3<align>(Load<false>(current + col), background + col, backgroundStride, _sums[0]);
                    AbsDifferenceSums3x3<align>(Load<false>(current + col + A), background + col + A, backgroundStride, _sums[0]);
                }
                for (; col < bodyWidth; col += A)
                    AbsDifferenceSums3x3<align>(Load<false>(current + col), background + col, backgroundStride, _sums[0]);
                if (width - bodyWidth)
                {
                    const v128_u8 _current = vec_and(tailMask, Load<false>(current + width - A));
                    AbsDifferenceSums3x3Masked<false>(_current, background + width - A, backgroundStride, tailMask, _sums[0]);
                }

                for (size_t i = 0; i < 9; ++i)
                    sums[i] += ExtractSum(vec_add(_sums[0][i], _sums[1][i]));

                current += currentStride;
                background += backgroundStride;
            }
        }
Esempio n. 9
0
		void SobelDxAbsSum(const uint8_t * src, size_t stride, size_t width, size_t height, uint64_t * sum)
		{
			assert(width > A);
			size_t bodyWidth = Simd::AlignHi(width, A) - A;
			const uint8_t *src0, *src1, *src2;

			v16u8 a[3][3];
			v2u64 fullSum = Zero<v2u64>();
			const v16u8 K8_FF = Fill((uint8_t)0xff);
			v16u8 tailMask = ShiftLeft(K8_FF, A - width + bodyWidth);

			for (size_t row = 0; row < height; ++row)
			{
				src0 = src + stride*(row - 1);
				src1 = src0 + stride;
				src2 = src1 + stride;
				if (row == 0)
					src0 = src1;
				if (row == height - 1)
					src2 = src1;

				v4u32 rowSum = Zero<v4u32>();

				LoadNoseDx(src0 + 0, a[0]);
				LoadNoseDx(src1 + 0, a[1]);
				LoadNoseDx(src2 + 0, a[2]);
				SobelDxAbsSum(a, rowSum);
				for (size_t col = A; col < bodyWidth; col += A)
				{
					LoadBodyDx(src0 + col, a[0]);
					LoadBodyDx(src1 + col, a[1]);
					LoadBodyDx(src2 + col, a[2]);
					SobelDxAbsSum(a, rowSum);
				}
				LoadTailDx(src0 + width - A, a[0]);
				LoadTailDx(src1 + width - A, a[1]);
				LoadTailDx(src2 + width - A, a[2]);
				SetMask3x3(a, tailMask);
				SobelDxAbsSum(a, rowSum);
				fullSum = PadSum(fullSum,rowSum);
			}
			*sum = ExtractSum(fullSum);
        }
        void ConditionalSquareSum(const uint8_t * src, size_t srcStride, size_t width, size_t height,
            const uint8_t * mask, size_t maskStride, uint8_t value, uint64_t * sum)
        {
            assert(width >= A);
            if (align)
                assert(Aligned(src) && Aligned(srcStride) && Aligned(mask) && Aligned(maskStride));

            size_t alignedWidth = AlignLo(width, QA);
            size_t bodyWidth = AlignLo(width, A);
            v128_u8 tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);
            v128_u8 _value = SetU8(value);
            *sum = 0;
            for (size_t row = 0; row < height; ++row)
            {
                size_t col = 0;
                v128_u32 sums[4] = { K32_00000000, K32_00000000, K32_00000000, K32_00000000 };
                for (; col < alignedWidth; col += QA)
                {
                    ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]);
                    ConditionalSquareSum<align, compareType>(src, mask, col + A, _value, sums[1]);
                    ConditionalSquareSum<align, compareType>(src, mask, col + 2 * A, _value, sums[2]);
                    ConditionalSquareSum<align, compareType>(src, mask, col + 3 * A, _value, sums[3]);
                }
                sums[0] = vec_add(vec_add(sums[0], sums[1]), vec_add(sums[2], sums[3]));
                for (; col < bodyWidth; col += A)
                    ConditionalSquareSum<align, compareType>(src, mask, col, _value, sums[0]);
                if (alignedWidth != width)
                {
                    const v128_u8 _mask = Compare8u<compareType>(Load<false>(mask + width - A), _value);
                    const v128_u8 _src = vec_and(vec_and(Load<false>(src + width - A), _mask), tailMask);
                    sums[0] = vec_msum(_src, _src, sums[0]);
                }
                *sum += ExtractSum(sums[0]);
                src += srcStride;
                mask += maskStride;
            }
        }