size_t strlenNEON(const char *p) { const char *const top = p; uint8x16_t c16 = vdupq_n_u8(0); /* 16 byte alignment */ size_t n = reinterpret_cast<size_t>(p) & 15; if (n > 0) { uint8x16_t x = *(const uint8x16_t*)&p[-n]; uint8x16_t a = vceqq_u8(x, c16); unsigned long mask = GetByteMask(a) << (16 + n); if (mask) { return __builtin_clz(mask); } p += 16 - n; } assert((reinterpret_cast<size_t>(p) & 15) == 0); for (;;) { uint8x16_t x = *(const uint8x16_t*)&p[0]; uint8x16_t a = vceqq_u8(x, c16); if (isFound(a)) { unsigned int mask = GetByteMask(a); return p + __builtin_clz(mask) - top; } p += 16; } }
void test_vceqQu8 (void) { uint8x16_t out_uint8x16_t; uint8x16_t arg0_uint8x16_t; uint8x16_t arg1_uint8x16_t; out_uint8x16_t = vceqq_u8 (arg0_uint8x16_t, arg1_uint8x16_t); }
void *memchrNEON(const void *ptr, int c, size_t len) { const char *p = reinterpret_cast<const char*>(ptr); if (len >= 16) { uint8x16_t c16 = vdupq_n_u8(static_cast<char>(c)); /* 16 byte alignment */ size_t n = reinterpret_cast<size_t>(p) & 15; if (n > 0) { uint8x16_t x = *(const uint8x16_t*)&p[-n]; uint8x16_t a = vceqq_u8(x, c16); unsigned long mask = GetByteMask(a) << (16 + n); if (mask) { return (void*)(p + __builtin_clz(mask)); } n = 16 - n; len -= n; p += n; } while (len >= 32) { uint8x16_t x = *(const uint8x16_t*)&p[0]; uint8x16_t y = *(const uint8x16_t*)&p[16]; uint8x16_t a = vceqq_u8(x, c16); uint8x16_t b = vceqq_u8(y, c16); if (isFound2(a, b)) { unsigned int mask = GetByteMask2(a,b); return (void*)(p + __builtin_clz(mask)); } len -= 32; p += 32; } } while (len > 0) { if (*p == c) return (void*)p; p++; len--; } return 0; }
template <bool align> void SquaredDifferenceSumMasked( const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, const uint8_t *mask, size_t maskStride, uint8_t index, size_t width, size_t height, uint64_t * sum) { assert(width < 0x10000); if (align) { assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride)); assert(Aligned(mask) && Aligned(maskStride)); } size_t alignedWidth = Simd::AlignLo(width, A); uint8x16_t tailMask = ShiftLeft(K8_FF, A - width + alignedWidth); uint8x16_t _index = vdupq_n_u8(index); uint64x2_t _sum = K64_0000000000000000; for (size_t row = 0; row < height; ++row) { uint32x4_t rowSum = K32_00000000; for (size_t col = 0; col < alignedWidth; col += A) { uint8x16_t _mask = vceqq_u8(Load<align>(mask + col), _index); uint8x16_t _a = Load<align>(a + col); uint8x16_t _b = Load<align>(b + col); rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask)); } if (width - alignedWidth) { uint8x16_t _mask = vandq_u8(tailMask, vceqq_u8(Load<align>(mask + width - A), _index)); uint8x16_t _a = Load<align>(a + width - A); uint8x16_t _b = Load<align>(b + width - A); rowSum = vaddq_u32(rowSum, SquaredDifferenceSumMasked(_a, _b, _mask)); } _sum = vaddq_u64(_sum, vpaddlq_u32(rowSum)); a += aStride; b += bStride; mask += maskStride; } *sum = ExtractSum64u(_sum); }
// Type-overloaded spelling of vceqq_u8: forwards both uint8x16_t operands to
// the intrinsic and returns its result unchanged.
inline uint8x16_t vceqq(const uint8x16_t & v0, const uint8x16_t & v1)
{
    const uint8x16_t equalLanes = vceqq_u8(v0, v1);
    return equalLanes;
}