コード例 #1
0
ファイル: neon_search.cpp プロジェクト: magurosan/strlen_neon
size_t strlenNEON(const char *p)
{
	const char *const top = p;
	uint8x16_t c16 = vdupq_n_u8(0);
	/* 16 byte alignment */
	size_t n = reinterpret_cast<size_t>(p) & 15;
	if (n > 0) {
		uint8x16_t x = *(const uint8x16_t*)&p[-n];
		uint8x16_t a = vceqq_u8(x, c16);
		unsigned long mask = GetByteMask(a) << (16 + n);
		if (mask) {
			return __builtin_clz(mask);
		}
		p += 16 - n;
	}
	assert((reinterpret_cast<size_t>(p) & 15) == 0);
	for (;;) {
		uint8x16_t x = *(const uint8x16_t*)&p[0];
		uint8x16_t a = vceqq_u8(x, c16);

		if (isFound(a)) {
 			unsigned int mask = GetByteMask(a);
			return p + __builtin_clz(mask) - top;
		}
		p += 16;
	}
}
コード例 #2
0
ファイル: vceqQu8.c プロジェクト: AlexMioMio/gcc
/* Calls the vceqq_u8 intrinsic once on two uint8x16_t operands.
   The operands are never initialized in this function; presumably the test
   only needs the call to compile -- TODO confirm against the test harness.  */
void test_vceqQu8 (void)
{
  uint8x16_t lhs;
  uint8x16_t rhs;

  /* The comparison result is stored but not consumed.  */
  uint8x16_t result = vceqq_u8 (lhs, rhs);
}
コード例 #3
0
ファイル: neon_search.cpp プロジェクト: magurosan/strlen_neon
/*
 * NEON-accelerated memchr.
 *
 * Searches the len bytes starting at ptr for the first byte equal to c
 * (converted to unsigned char, as memchr does) and returns a pointer to it,
 * or 0 when no such byte exists.
 *
 * Buffers of at least 16 bytes are scanned with aligned 16-byte vector loads:
 * an optional head block to reach 16-byte alignment, then 32 bytes per
 * iteration. Whatever remains (fewer than 32 bytes, or the whole buffer when
 * len < 16) is handled by a byte-wise loop.
 */
void *memchrNEON(const void *ptr, int c, size_t len)
{
	const char *p = reinterpret_cast<const char*>(ptr);
	/* Narrow c once so the vector path and the scalar path look for the
	   same 8-bit value regardless of the signedness of plain char. */
	const unsigned char target = static_cast<unsigned char>(c);

	if (len >= 16) {
		uint8x16_t c16 = vdupq_n_u8(target);
		/* 16 byte alignment */
		size_t n = reinterpret_cast<size_t>(p) & 15;
		if (n > 0) {
			/* Aligned block containing p; it begins n bytes before p. */
			uint8x16_t x = *(const uint8x16_t*)(p - n);
			uint8x16_t a = vceqq_u8(x, c16);

			/* The shift discards matches in the n lanes that precede p.
			   NOTE(review): assumes GetByteMask places lane 0 at bit 15 of a
			   32-bit value -- confirm against its definition. */
			unsigned long mask = GetByteMask(a) << (16 + n);
			if (mask) {
				return const_cast<char*>(p + __builtin_clz(mask));
			}
			/* Skip the 16 - n bytes just examined; len >= 16 here, so this
			   cannot underflow. */
			n = 16 - n;
			len -= n;
			p += n;
		}
		/* Main loop: two aligned vectors (32 bytes) per iteration. */
		while (len >= 32) {
			uint8x16_t x = *(const uint8x16_t*)&p[0];
			uint8x16_t y = *(const uint8x16_t*)&p[16];
			uint8x16_t a = vceqq_u8(x, c16);
			uint8x16_t b = vceqq_u8(y, c16);

			if (isFound2(a, b)) {
				unsigned int mask = GetByteMask2(a,b);
				return const_cast<char*>(p + __builtin_clz(mask));
			}
			len -= 32;
			p += 32;
		}
	}

	/* Scalar tail. Compare as unsigned char: comparing a plain char with the
	   int c directly misses matches when char is signed and c is in
	   128..255, or when c lies outside the char range. */
	while (len > 0) {
		if (static_cast<unsigned char>(*p) == target) return const_cast<char*>(p);
		p++;
		len--;
	}
	return 0;
}
コード例 #4
0
        /*
         * Accumulates a masked squared-difference sum of two 8-bit images and
         * stores the 64-bit total in *sum.
         *
         * A lane takes part only where the corresponding mask byte equals index
         * (lane selection is done with vceqq_u8). The per-vector arithmetic is
         * delegated to the SquaredDifferenceSumMasked(vector, vector, vector)
         * overload, which is defined elsewhere.
         *
         * a, b, mask               - row pointers of the two images and the mask.
         * aStride/bStride/maskStride - byte step between consecutive rows.
         * index                    - mask value that selects a pixel.
         * width, height            - image size in pixels; width must be < 0x10000.
         * align                    - when true, all pointers and strides must be aligned.
         *
         * NOTE(review): when width is not a multiple of A the tail block is read
         * from (row + width - A), so width must be at least A in that case.
         */
        template <bool align> void SquaredDifferenceSumMasked(
            const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride,
            const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum)
        {
            // Presumably bounds the per-row 32-bit accumulators -- TODO confirm.
            assert(width < 0x10000);
            if (align)
            {
                assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
                assert(Aligned(mask) && Aligned(maskStride));
            }

            // Portion of the row covered by whole A-byte vectors.
            size_t alignedWidth = Simd::AlignLo(width, A);
            // The tail vector overlaps the last full vector by (A - width + alignedWidth)
            // bytes; tailMask presumably clears those already-counted lanes.
            uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth);
            uint8x16_t _index = vdupq_n_u8(index);
            uint64x2_t _sum = K64_0000000000000000;

            for (size_t row = 0; row < height; ++row)
            {
                uint32x4_t rowSum = K32_00000000;
                for (size_t col = 0; col < alignedWidth; col += A)
                {
                    uint8x16_t _mask = vceqq_u8(Load<align>(mask + col), _index);
                    uint8x16_t _a = Load<align>(a + col);
                    uint8x16_t _b = Load<align>(b + col);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask));
                }
                if (width - alignedWidth)
                {
                    // The tail starts at (width - A), which is not a multiple of A
                    // whenever this branch runs, so it must always be an unaligned
                    // load even if the row pointers themselves are aligned.
                    uint8x16_t _mask = vandq_u8(tailMask, vceqq_u8(Load<false>(mask + width - A), _index));
                    uint8x16_t _a = Load<false>(a + width - A);
                    uint8x16_t _b = Load<false>(b + width - A);
                    rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask));
                }
                // Widen the row's four 32-bit partial sums into the 64-bit total.
                _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum));
                a += aStride;
                b += bStride;
                mask += maskStride;
            }
            *sum = ExtractSum64u(_sum);
        }
コード例 #5
0
ファイル: vtransform.hpp プロジェクト: 007Indian/opencv
// Type-overloaded front end for vceqq_u8: forwards both uint8x16_t operands
// to the intrinsic and returns its result unchanged.
inline uint8x16_t vceqq(const uint8x16_t &v0, const uint8x16_t &v1)
{
    return vceqq_u8(v0, v1);
}